diff --git a/programs/develop/libraries/newlib/sdk/fasm/include/pixman-1.inc b/programs/develop/libraries/newlib/sdk/fasm/include/pixman-1.inc new file mode 100644 index 0000000000..a59c2b2679 --- /dev/null +++ b/programs/develop/libraries/newlib/sdk/fasm/include/pixman-1.inc @@ -0,0 +1,148 @@ +import pixman-1,\ + _pixman_internal_only_get_implementation,'_pixman_internal_only_get_implementation',\ + pixman_add_trapezoids,'pixman_add_trapezoids',\ + pixman_add_traps,'pixman_add_traps',\ + pixman_add_triangles,'pixman_add_triangles',\ + pixman_blt,'pixman_blt',\ + pixman_composite_glyphs,'pixman_composite_glyphs',\ + pixman_composite_glyphs_no_mask,'pixman_composite_glyphs_no_mask',\ + pixman_composite_trapezoids,'pixman_composite_trapezoids',\ + pixman_composite_triangles,'pixman_composite_triangles',\ + pixman_compute_composite_region,'pixman_compute_composite_region',\ + pixman_disable_out_of_bounds_workaround,'pixman_disable_out_of_bounds_workaround',\ + pixman_edge_init,'pixman_edge_init',\ + pixman_edge_step,'pixman_edge_step',\ + pixman_f_transform_bounds,'pixman_f_transform_bounds',\ + pixman_f_transform_from_pixman_transform,'pixman_f_transform_from_pixman_transform',\ + pixman_f_transform_init_identity,'pixman_f_transform_init_identity',\ + pixman_f_transform_init_rotate,'pixman_f_transform_init_rotate',\ + pixman_f_transform_init_scale,'pixman_f_transform_init_scale',\ + pixman_f_transform_init_translate,'pixman_f_transform_init_translate',\ + pixman_f_transform_invert,'pixman_f_transform_invert',\ + pixman_f_transform_multiply,'pixman_f_transform_multiply',\ + pixman_f_transform_point,'pixman_f_transform_point',\ + pixman_f_transform_point_3d,'pixman_f_transform_point_3d',\ + pixman_f_transform_rotate,'pixman_f_transform_rotate',\ + pixman_f_transform_scale,'pixman_f_transform_scale',\ + pixman_f_transform_translate,'pixman_f_transform_translate',\ + pixman_fill,'pixman_fill',\ + pixman_filter_create_separable_convolution,'pixman_filter_create_separable_convolution',\ + pixman_format_supported_destination,'pixman_format_supported_destination',\ + pixman_format_supported_source,'pixman_format_supported_source',\ + pixman_glyph_cache_create,'pixman_glyph_cache_create',\ + pixman_glyph_cache_destroy,'pixman_glyph_cache_destroy',\ + pixman_glyph_cache_freeze,'pixman_glyph_cache_freeze',\ + pixman_glyph_cache_insert,'pixman_glyph_cache_insert',\ + pixman_glyph_cache_lookup,'pixman_glyph_cache_lookup',\ + pixman_glyph_cache_remove,'pixman_glyph_cache_remove',\ + pixman_glyph_cache_thaw,'pixman_glyph_cache_thaw',\ + pixman_glyph_get_extents,'pixman_glyph_get_extents',\ + pixman_glyph_get_mask_format,'pixman_glyph_get_mask_format',\ + pixman_image_composite,'pixman_image_composite',\ + pixman_image_composite32,'pixman_image_composite32',\ + pixman_image_create_bits,'pixman_image_create_bits',\ + pixman_image_create_bits_no_clear,'pixman_image_create_bits_no_clear',\ + pixman_image_create_conical_gradient,'pixman_image_create_conical_gradient',\ + pixman_image_create_linear_gradient,'pixman_image_create_linear_gradient',\ + pixman_image_create_radial_gradient,'pixman_image_create_radial_gradient',\ + pixman_image_create_solid_fill,'pixman_image_create_solid_fill',\ + pixman_image_fill_boxes,'pixman_image_fill_boxes',\ + pixman_image_fill_rectangles,'pixman_image_fill_rectangles',\ + pixman_image_get_component_alpha,'pixman_image_get_component_alpha',\ + pixman_image_get_data,'pixman_image_get_data',\ + pixman_image_get_depth,'pixman_image_get_depth',\ + 
pixman_image_get_destroy_data,'pixman_image_get_destroy_data',\ + pixman_image_get_format,'pixman_image_get_format',\ + pixman_image_get_height,'pixman_image_get_height',\ + pixman_image_get_stride,'pixman_image_get_stride',\ + pixman_image_get_width,'pixman_image_get_width',\ + pixman_image_ref,'pixman_image_ref',\ + pixman_image_set_accessors,'pixman_image_set_accessors',\ + pixman_image_set_alpha_map,'pixman_image_set_alpha_map',\ + pixman_image_set_clip_region,'pixman_image_set_clip_region',\ + pixman_image_set_clip_region32,'pixman_image_set_clip_region32',\ + pixman_image_set_component_alpha,'pixman_image_set_component_alpha',\ + pixman_image_set_destroy_function,'pixman_image_set_destroy_function',\ + pixman_image_set_filter,'pixman_image_set_filter',\ + pixman_image_set_has_client_clip,'pixman_image_set_has_client_clip',\ + pixman_image_set_indexed,'pixman_image_set_indexed',\ + pixman_image_set_repeat,'pixman_image_set_repeat',\ + pixman_image_set_source_clipping,'pixman_image_set_source_clipping',\ + pixman_image_set_transform,'pixman_image_set_transform',\ + pixman_image_unref,'pixman_image_unref',\ + pixman_line_fixed_edge_init,'pixman_line_fixed_edge_init',\ + pixman_rasterize_edges,'pixman_rasterize_edges',\ + pixman_rasterize_trapezoid,'pixman_rasterize_trapezoid',\ + pixman_region32_clear,'pixman_region32_clear',\ + pixman_region32_contains_point,'pixman_region32_contains_point',\ + pixman_region32_contains_rectangle,'pixman_region32_contains_rectangle',\ + pixman_region32_copy,'pixman_region32_copy',\ + pixman_region32_equal,'pixman_region32_equal',\ + pixman_region32_extents,'pixman_region32_extents',\ + pixman_region32_fini,'pixman_region32_fini',\ + pixman_region32_init,'pixman_region32_init',\ + pixman_region32_init_from_image,'pixman_region32_init_from_image',\ + pixman_region32_init_rect,'pixman_region32_init_rect',\ + pixman_region32_init_rects,'pixman_region32_init_rects',\ + pixman_region32_init_with_extents,'pixman_region32_init_with_extents',\ + pixman_region32_intersect,'pixman_region32_intersect',\ + pixman_region32_intersect_rect,'pixman_region32_intersect_rect',\ + pixman_region32_inverse,'pixman_region32_inverse',\ + pixman_region32_n_rects,'pixman_region32_n_rects',\ + pixman_region32_not_empty,'pixman_region32_not_empty',\ + pixman_region32_rectangles,'pixman_region32_rectangles',\ + pixman_region32_reset,'pixman_region32_reset',\ + pixman_region32_selfcheck,'pixman_region32_selfcheck',\ + pixman_region32_subtract,'pixman_region32_subtract',\ + pixman_region32_translate,'pixman_region32_translate',\ + pixman_region32_union,'pixman_region32_union',\ + pixman_region32_union_rect,'pixman_region32_union_rect',\ + pixman_region_clear,'pixman_region_clear',\ + pixman_region_contains_point,'pixman_region_contains_point',\ + pixman_region_contains_rectangle,'pixman_region_contains_rectangle',\ + pixman_region_copy,'pixman_region_copy',\ + pixman_region_equal,'pixman_region_equal',\ + pixman_region_extents,'pixman_region_extents',\ + pixman_region_fini,'pixman_region_fini',\ + pixman_region_init,'pixman_region_init',\ + pixman_region_init_from_image,'pixman_region_init_from_image',\ + pixman_region_init_rect,'pixman_region_init_rect',\ + pixman_region_init_rects,'pixman_region_init_rects',\ + pixman_region_init_with_extents,'pixman_region_init_with_extents',\ + pixman_region_intersect,'pixman_region_intersect',\ + pixman_region_intersect_rect,'pixman_region_intersect_rect',\ + pixman_region_inverse,'pixman_region_inverse',\ + 
pixman_region_n_rects,'pixman_region_n_rects',\ + pixman_region_not_empty,'pixman_region_not_empty',\ + pixman_region_rectangles,'pixman_region_rectangles',\ + pixman_region_reset,'pixman_region_reset',\ + pixman_region_selfcheck,'pixman_region_selfcheck',\ + pixman_region_set_static_pointers,'pixman_region_set_static_pointers',\ + pixman_region_subtract,'pixman_region_subtract',\ + pixman_region_translate,'pixman_region_translate',\ + pixman_region_union,'pixman_region_union',\ + pixman_region_union_rect,'pixman_region_union_rect',\ + pixman_sample_ceil_y,'pixman_sample_ceil_y',\ + pixman_sample_floor_y,'pixman_sample_floor_y',\ + pixman_transform_bounds,'pixman_transform_bounds',\ + pixman_transform_from_pixman_f_transform,'pixman_transform_from_pixman_f_transform',\ + pixman_transform_init_identity,'pixman_transform_init_identity',\ + pixman_transform_init_rotate,'pixman_transform_init_rotate',\ + pixman_transform_init_scale,'pixman_transform_init_scale',\ + pixman_transform_init_translate,'pixman_transform_init_translate',\ + pixman_transform_invert,'pixman_transform_invert',\ + pixman_transform_is_identity,'pixman_transform_is_identity',\ + pixman_transform_is_int_translate,'pixman_transform_is_int_translate',\ + pixman_transform_is_inverse,'pixman_transform_is_inverse',\ + pixman_transform_is_scale,'pixman_transform_is_scale',\ + pixman_transform_multiply,'pixman_transform_multiply',\ + pixman_transform_point,'pixman_transform_point',\ + pixman_transform_point_31_16,'pixman_transform_point_31_16',\ + pixman_transform_point_31_16_3d,'pixman_transform_point_31_16_3d',\ + pixman_transform_point_31_16_affine,'pixman_transform_point_31_16_affine',\ + pixman_transform_point_3d,'pixman_transform_point_3d',\ + pixman_transform_rotate,'pixman_transform_rotate',\ + pixman_transform_scale,'pixman_transform_scale',\ + pixman_transform_translate,'pixman_transform_translate',\ + pixman_version,'pixman_version',\ + pixman_version_string,'pixman_version_string' diff --git a/programs/develop/libraries/pixman/COPYING b/programs/develop/libraries/pixman/COPYING new file mode 100644 index 0000000000..6168dea56f --- /dev/null +++ b/programs/develop/libraries/pixman/COPYING @@ -0,0 +1,42 @@ +The following is the MIT license, agreed upon by most contributors. +Copyright holders of new code should use this license statement where +possible. They may also add themselves to the list below. + +/* + * Copyright 1987, 1988, 1989, 1998 The Open Group + * Copyright 1987, 1988, 1989 Digital Equipment Corporation + * Copyright 1999, 2004, 2008 Keith Packard + * Copyright 2000 SuSE, Inc. + * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc. + * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc. + * Copyright 2004 Nicholas Miell + * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech + * Copyright 2005 Trolltech AS + * Copyright 2007 Luca Barbato + * Copyright 2008 Aaron Plattner, NVIDIA Corporation + * Copyright 2008 Rodrigo Kumpera + * Copyright 2008 André Tupinambá + * Copyright 2008 Mozilla Corporation + * Copyright 2008 Frederic Plourde + * Copyright 2009, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright 2009, 2010 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ diff --git a/programs/develop/libraries/pixman/Makefile b/programs/develop/libraries/pixman/Makefile index 36501a32f0..c063741e92 100644 --- a/programs/develop/libraries/pixman/Makefile +++ b/programs/develop/libraries/pixman/Makefile @@ -2,45 +2,52 @@ LIBRARY = pixman-1 CC = gcc +CFLAGS = -U_Win32 -U_WIN32 -U__MINGW32__ -c -O2 -Wall -Winline -fomit-frame-pointer -CFLAGS = -c -O2 -mmmx -Winline -fomit-frame-pointer +LD = ld +LDFLAGS = -shared -s -nostdlib -T ../newlib/dll.lds --entry _DllStartup --image-base=0 --out-implib $(LIBRARY).dll.a -LDIMPORT:= -nostdlib --out-implib libpiximp.a --exclude-libs libamz.a -LDFLAGS:= -shared -s -T ../newlib/dll.lds --image-base 0 +STRIP = $(PREFIX)strip -DEFINES = -DHAVE_CONFIG_H -DPIXMAN_NO_TLS -DUSE_MMX - -INCLUDES = -I../pixman -I../newlib/include +INCLUDES= -I. 
-I../newlib/include LIBPATH:= -L../newlib -LIBS:= -lamz -lgcc -lcimp +LIBS:= -ldll -lc.dll -lgcc + +DEFINES = -DHAVE_CONFIG_H + SOURCES = \ - pixman-image.c \ + pixman.c \ pixman-access.c \ pixman-access-accessors.c \ - pixman-region16.c \ - pixman-region32.c \ + pixman-bits-image.c \ pixman-combine32.c \ - pixman-combine64.c \ - pixman-utils.c \ + pixman-combine-float.c \ + pixman-conical-gradient.c \ pixman-edge.c \ pixman-edge-accessors.c \ - pixman-trap.c \ - pixman-timer.c \ - pixman-matrix.c \ - pixman-gradient-walker.c \ - pixman-linear-gradient.c \ - pixman-radial-gradient.c \ - pixman-bits-image.c \ - pixman.c \ - pixman-cpu.c \ pixman-fast-path.c \ - pixman-implementation.c \ - pixman-solid-fill.c \ + pixman-filter.c \ pixman-general.c \ + pixman-glyph.c \ + pixman-gradient-walker.c \ + pixman-image.c \ + pixman-implementation.c \ + pixman-linear-gradient.c \ + pixman-matrix.c \ + pixman-noop.c \ + pixman-radial-gradient.c \ + pixman-region16.c \ + pixman-region32.c \ + pixman-solid-fill.c \ + pixman-timer.c \ + pixman-trap.c \ + pixman-utils.c \ + pixman-x86.c \ pixman-mmx.c \ + pixman-sse2.c \ $(NULL) OBJECTS = $(patsubst %.c, %.o, $(SOURCES)) @@ -53,13 +60,21 @@ all:$(LIBRARY).a $(LIBRARY).dll $(LIBRARY).a: $(OBJECTS) Makefile ar cvrs $(LIBRARY).a $(OBJECTS) -$(LIBRARY).dll: $(OBJECTS) Makefile - ld $(LDFLAGS) $(LDIMPORT) $(LIBPATH) -o $@ $(OBJECTS) $(LIBS) +$(LIBRARY).dll: $(LIBRARY).def $(OBJECTS) Makefile + $(LD) $(LDFLAGS) $(LIBPATH) -o $@ $(LIBRARY).def $(OBJECTS) $(LIBS) + $(STRIP) $@ + sed -f ../newlib/cmd1.sed $(LIBRARY).def > mem + sed -f ../newlib/cmd2.sed mem >$(LIBRARY).inc - -%.o: %.c $(SOURCES) Makefile +%.o : %.c Makefile $(CC) $(CFLAGS) $(DEFINES) $(INCLUDES) -o $@ $< +pixman-mmx.o: pixman-mmx.c Makefile + $(CC) $(CFLAGS) -mmmx $(DEFINES) $(INCLUDES) -o $@ $< + +pixman-sse2.o: pixman-sse2.c Makefile + $(CC) $(CFLAGS) -msse2 $(DEFINES) $(INCLUDES) -o $@ $< + clean: -rm -f *.o diff --git a/programs/develop/libraries/pixman/README b/programs/develop/libraries/pixman/README index 3cfbc5053e..6d8cfd8ad5 100644 --- a/programs/develop/libraries/pixman/README +++ b/programs/develop/libraries/pixman/README @@ -1,22 +1,116 @@ -pixman is a library that provides low-level pixel manipulation +Pixman is a library that provides low-level pixel manipulation features such as image compositing and trapezoid rasterization. -All questions regarding this software should be directed to the pixman +Questions, bug reports and patches should be directed to the pixman mailing list: http://lists.freedesktop.org/mailman/listinfo/pixman -Please send patches and bug reports either to the mailing list above, -or file them at the freedesktop bug tracker: +You can also file bugs at https://bugs.freedesktop.org/enter_bug.cgi?product=pixman -The master development code repository can be found at: +For real-time discussions about pixman, feel free to join the IRC +channels #cairo and #xorg-devel on the FreeNode IRC network. + + +Contributing +------------ + +In order to contribute to pixman, you will need a working knowledge of +the git version control system. For a quick getting started guide, +there is the "Everyday Git With 20 Commands Or So" guide + + http://www.kernel.org/pub/software/scm/git/docs/everyday.html + +from the Git homepage. For more in-depth git documentation, see the +resources on the Git community documentation page: + + http://git-scm.com/documentation + +Pixman uses the infrastructure from the freedesktop.org umbrella +project.
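A note on the Makefile hunk above: pixman-mmx.c and pixman-sse2.c get their own build rules with -mmmx and -msse2 because objects compiled with SIMD flags must only be entered after a runtime CPU check; all other objects keep the plain CFLAGS, and pixman-x86.c selects an implementation at runtime. The sketch below illustrates that dispatch pattern only: it uses GCC's __builtin_cpu_supports() rather than pixman's own cpuid probing, and both composite_* functions are made-up stand-ins, not pixman API.

    #include <stdio.h>

    /* Stand-in for the generic C path (built without SIMD flags). */
    static void composite_generic (void) { puts ("generic path"); }

    /* Stand-in for code that would live in the -msse2 object. */
    static void composite_sse2 (void) { puts ("sse2 path"); }

    int
    main (void)
    {
        /* Branch into SSE2-compiled code only after the runtime check,
         * which is why pixman-sse2.o needs a separate build rule. */
        if (__builtin_cpu_supports ("sse2"))
            composite_sse2 ();
        else
            composite_generic ();

        return 0;
    }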
For instructions about how to use the git service on +freedesktop.org, see: + + http://www.freedesktop.org/wiki/Infrastructure/git/Developers + +The Pixman master repository can be found at: git://anongit.freedesktop.org/git/pixman - http://gitweb.freedesktop.org/?p=pixman;a=summary +and browsed on the web here: -For more information on the git code manager, see: + http://cgit.freedesktop.org/pixman/ - http://wiki.x.org/wiki/GitPage + +Sending patches +--------------- + +The general workflow for sending patches is to first make sure that +git can send mail on your system. Then, + + - create a branch off of master in your local git repository + + - make your changes as one or more commits + + - use the + + git send-email + + command to send the patch series to pixman@lists.freedesktop.org. + +In order for your patches to be accepted, please consider the +following guidelines: + + - This link: + + http://www.kernel.org/pub/software/scm/git/docs/user-manual.html#patch-series + + describes what a good patch series is, and how to create one with + git. + + - At each point in the series, pixman should compile and the test + suite should pass. + + The exception here is if you are changing the test suite to + demonstrate a bug. In this case, make one commit that makes the + test suite fail due to the bug, and then another commit that fixes + the bug. + + You can run the test suite with + + make check + + It will take around two minutes to run on a modern PC. + + - Follow the coding style described in the CODING_STYLE file + + - For bug fixes, include an update to the test suite to make sure + the bug doesn't reappear. + + - For new features, add tests of the feature to the test + suite. Also, add a program demonstrating the new feature to the + demos/ directory. + + - Write descriptive commit messages. Useful information to include: + - Benchmark results, before and after + - Description of the bug that was fixed + - Detailed rationale for any new API + - Alternative approaches that were rejected (and why they + don't work) + - If review comments were incorporated, a brief version + history describing what those changes were. + + - For big patch series, send an introductory email with an overall + description of the patch series, including benchmarks and + motivation. Each commit message should still be descriptive and + include enough information to understand why this particular commit + was necessary. + +Pixman has high standards for code quality and so almost everybody +should expect to have the first versions of their patches rejected. + +If you think that the reviewers are wrong about something, or that the +guidelines above are wrong, feel free to discuss the issue on the +list. The purpose of the guidelines and code review is to ensure high +code quality; it is not an exercise in compliance. diff --git a/programs/develop/libraries/pixman/config.h b/programs/develop/libraries/pixman/config.h index c57fa62ed3..d7651c76a9 100644 --- a/programs/develop/libraries/pixman/config.h +++ b/programs/develop/libraries/pixman/config.h @@ -10,6 +10,15 @@ /* Define to 1 if you have the <dlfcn.h> header file. */ /* #undef HAVE_DLFCN_H */ +/* Whether we have feenableexcept() */ +/* #undef HAVE_FEENABLEEXCEPT */ + +/* Define to 1 if we have <fenv.h> */ +#define HAVE_FENV_H 1 + +/* Whether the tool chain supports __float128 */ +#define HAVE_FLOAT128 /**/ + /* Define to 1 if you have the `getisax' function. */ /* #undef HAVE_GETISAX */ @@ -25,9 +34,15 @@ /* Define to 1 if you have the `pixman-1' library (-lpixman-1).
*/ /* #undef HAVE_LIBPIXMAN_1 */ +/* Whether we have libpng */ +/* #undef HAVE_LIBPNG */ + /* Define to 1 if you have the <memory.h> header file. */ #define HAVE_MEMORY_H 1 +/* Whether we have mmap() */ +#define HAVE_MMAP + /* Whether we have mprotect() */ #define HAVE_MPROTECT 1 @@ -72,13 +87,13 @@ #define PACKAGE "pixman" /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT ""pixman@lists.freedesktop.org"" +#define PACKAGE_BUGREPORT "pixman@lists.freedesktop.org" /* Define to the full name of this package. */ #define PACKAGE_NAME "pixman" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "pixman 0.20.2" +#define PACKAGE_STRING "pixman 0.30.2" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pixman" @@ -87,7 +102,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "0.20.2" +#define PACKAGE_VERSION "0.30.2" /* enable TIMER_BEGIN/TIMER_END macros */ /* #undef PIXMAN_TIMERS */ @@ -98,8 +113,14 @@ /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 -/* Whether the tool chain supports __thread */ -//#define TOOLCHAIN_SUPPORTS__THREAD /**/ +/* The compiler supported TLS storage class */ +#define TLS __thread + +/* Whether the tool chain supports __attribute__((constructor)) */ +#define TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR /**/ + +/* use ARM IWMMXT compiler intrinsics */ +/* #undef USE_ARM_IWMMXT */ /* use ARM NEON assembly optimizations */ /* #undef USE_ARM_NEON */ @@ -110,20 +131,26 @@ /* use GNU-style inline assembler */ #define USE_GCC_INLINE_ASM 1 -/* use MMX compiler intrinsics */ -#define USE_MMX 1 +/* use Loongson Multimedia Instructions */ +/* #undef USE_LOONGSON_MMI */ + +/* use MIPS DSPr2 assembly optimizations */ +/* #undef USE_MIPS_DSPR2 */ /* use OpenMP in the test suite */ -//#define USE_OPENMP 1 +/* #undef USE_OPENMP */ /* use SSE2 compiler intrinsics */ -//#define USE_SSE2 1 +#define USE_SSE2 1 /* use VMX compiler intrinsics */ /* #undef USE_VMX */ +/* use x86 MMX compiler intrinsics */ +#define USE_X86_MMX 1 + /* Version number of package */ -#define VERSION "0.20.2" +#define VERSION "0.30.2" /* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most significant byte first (like Motorola and SPARC, unlike Intel). */ @@ -142,3 +169,6 @@ #ifndef __cplusplus /* #undef inline */ #endif + +/* Define to sqrt if you do not have the `sqrtf' function.
*/ +/* #undef sqrtf */ diff --git a/programs/develop/libraries/pixman/pixman-1.def b/programs/develop/libraries/pixman/pixman-1.def new file mode 100644 index 0000000000..ee57353797 --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-1.def @@ -0,0 +1,148 @@ +EXPORTS + _pixman_internal_only_get_implementation + pixman_add_trapezoids + pixman_add_traps + pixman_add_triangles + pixman_blt + pixman_composite_glyphs + pixman_composite_glyphs_no_mask + pixman_composite_trapezoids + pixman_composite_triangles + pixman_compute_composite_region + pixman_disable_out_of_bounds_workaround + pixman_edge_init + pixman_edge_step + pixman_f_transform_bounds + pixman_f_transform_from_pixman_transform + pixman_f_transform_init_identity + pixman_f_transform_init_rotate + pixman_f_transform_init_scale + pixman_f_transform_init_translate + pixman_f_transform_invert + pixman_f_transform_multiply + pixman_f_transform_point + pixman_f_transform_point_3d + pixman_f_transform_rotate + pixman_f_transform_scale + pixman_f_transform_translate + pixman_fill + pixman_filter_create_separable_convolution + pixman_format_supported_destination + pixman_format_supported_source + pixman_glyph_cache_create + pixman_glyph_cache_destroy + pixman_glyph_cache_freeze + pixman_glyph_cache_insert + pixman_glyph_cache_lookup + pixman_glyph_cache_remove + pixman_glyph_cache_thaw + pixman_glyph_get_extents + pixman_glyph_get_mask_format + pixman_image_composite + pixman_image_composite32 + pixman_image_create_bits + pixman_image_create_bits_no_clear + pixman_image_create_conical_gradient + pixman_image_create_linear_gradient + pixman_image_create_radial_gradient + pixman_image_create_solid_fill + pixman_image_fill_boxes + pixman_image_fill_rectangles + pixman_image_get_component_alpha + pixman_image_get_data + pixman_image_get_depth + pixman_image_get_destroy_data + pixman_image_get_format + pixman_image_get_height + pixman_image_get_stride + pixman_image_get_width + pixman_image_ref + pixman_image_set_accessors + pixman_image_set_alpha_map + pixman_image_set_clip_region + pixman_image_set_clip_region32 + pixman_image_set_component_alpha + pixman_image_set_destroy_function + pixman_image_set_filter + pixman_image_set_has_client_clip + pixman_image_set_indexed + pixman_image_set_repeat + pixman_image_set_source_clipping + pixman_image_set_transform + pixman_image_unref + pixman_line_fixed_edge_init + pixman_rasterize_edges + pixman_rasterize_trapezoid + pixman_region32_clear + pixman_region32_contains_point + pixman_region32_contains_rectangle + pixman_region32_copy + pixman_region32_equal + pixman_region32_extents + pixman_region32_fini + pixman_region32_init + pixman_region32_init_from_image + pixman_region32_init_rect + pixman_region32_init_rects + pixman_region32_init_with_extents + pixman_region32_intersect + pixman_region32_intersect_rect + pixman_region32_inverse + pixman_region32_n_rects + pixman_region32_not_empty + pixman_region32_rectangles + pixman_region32_reset + pixman_region32_selfcheck + pixman_region32_subtract + pixman_region32_translate + pixman_region32_union + pixman_region32_union_rect + pixman_region_clear + pixman_region_contains_point + pixman_region_contains_rectangle + pixman_region_copy + pixman_region_equal + pixman_region_extents + pixman_region_fini + pixman_region_init + pixman_region_init_from_image + pixman_region_init_rect + pixman_region_init_rects + pixman_region_init_with_extents + pixman_region_intersect + pixman_region_intersect_rect + pixman_region_inverse + pixman_region_n_rects + 
pixman_region_not_empty + pixman_region_rectangles + pixman_region_reset + pixman_region_selfcheck + pixman_region_set_static_pointers + pixman_region_subtract + pixman_region_translate + pixman_region_union + pixman_region_union_rect + pixman_sample_ceil_y + pixman_sample_floor_y + pixman_transform_bounds + pixman_transform_from_pixman_f_transform + pixman_transform_init_identity + pixman_transform_init_rotate + pixman_transform_init_scale + pixman_transform_init_translate + pixman_transform_invert + pixman_transform_is_identity + pixman_transform_is_int_translate + pixman_transform_is_inverse + pixman_transform_is_scale + pixman_transform_multiply + pixman_transform_point + pixman_transform_point_31_16 + pixman_transform_point_31_16_3d + pixman_transform_point_31_16_affine + pixman_transform_point_3d + pixman_transform_rotate + pixman_transform_scale + pixman_transform_translate + pixman_version + pixman_version_string diff --git a/programs/develop/libraries/pixman/pixman-access.c b/programs/develop/libraries/pixman/pixman-access.c index f1ce0ba400..b5c8e4017a 100644 --- a/programs/develop/libraries/pixman/pixman-access.c +++ b/programs/develop/libraries/pixman/pixman-access.c @@ -31,9 +31,10 @@ #include <config.h> #include <stdlib.h> #include <string.h> +#include <assert.h> -#include "pixman-private.h" #include "pixman-accessor.h" +#include "pixman-private.h" #define CONVERT_RGB24_TO_Y15(s) \ (((((s) >> 16) & 0xff) * 153 + \ @@ -45,14 +46,119 @@ (((s) >> 6) & 0x03e0) | \ (((s) >> 9) & 0x7c00)) -#define RGB15_TO_ENTRY(mif,rgb15) \ - ((mif)->ent[rgb15]) +/* Fetch macros */ -#define RGB24_TO_ENTRY(mif,rgb24) \ - RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24)) +#ifdef WORDS_BIGENDIAN +#define FETCH_1(img,l,o) \ + (((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> (0x1f - ((o) & 0x1f))) & 0x1) +#else +#define FETCH_1(img,l,o) \ + ((((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> ((o) & 0x1f))) & 0x1) +#endif -#define RGB24_TO_ENTRY_Y(mif,rgb24) \ - ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)]) +#define FETCH_8(img,l,o) (READ (img, (((uint8_t *)(l)) + ((o) >> 3)))) + +#ifdef WORDS_BIGENDIAN +#define FETCH_4(img,l,o) \ + (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4)) +#else +#define FETCH_4(img,l,o) \ + (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf)) +#endif + +#ifdef WORDS_BIGENDIAN +#define FETCH_24(img,l,o) \ + ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16) | \ + (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \ + (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0)) +#else +#define FETCH_24(img,l,o) \ + ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0) | \ + (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \ + (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16)) +#endif + +/* Store macros */ + +#ifdef WORDS_BIGENDIAN +#define STORE_1(img,l,o,v) \ + do \ + { \ + uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \ + uint32_t __m, __v; \ + \ + __m = 1 << (0x1f - ((o) & 0x1f)); \ + __v = (v)? __m : 0; \ + \ + WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \ + } \ + while (0) +#else +#define STORE_1(img,l,o,v) \ + do \ + { \ + uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \ + uint32_t __m, __v; \ + \ + __m = 1 << ((o) & 0x1f); \ + __v = (v)?
__m : 0; \ + \ + WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \ + } \ + while (0) +#endif + +#define STORE_8(img,l,o,v) (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v))) + +#ifdef WORDS_BIGENDIAN +#define STORE_4(img,l,o,v) \ + do \ + { \ + int bo = 4 * (o); \ + int v4 = (v) & 0x0f; \ + \ + STORE_8 (img, l, bo, ( \ + bo & 4 ? \ + (FETCH_8 (img, l, bo) & 0xf0) | (v4) : \ + (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4))); \ + } while (0) +#else +#define STORE_4(img,l,o,v) \ + do \ + { \ + int bo = 4 * (o); \ + int v4 = (v) & 0x0f; \ + \ + STORE_8 (img, l, bo, ( \ + bo & 4 ? \ + (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) : \ + (FETCH_8 (img, l, bo) & 0xf0) | (v4))); \ + } while (0) +#endif + +#ifdef WORDS_BIGENDIAN +#define STORE_24(img,l,o,v) \ + do \ + { \ + uint8_t *__tmp = (l) + 3 * (o); \ + \ + WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16); \ + WRITE ((img), __tmp++, ((v) & 0x0000ff00) >> 8); \ + WRITE ((img), __tmp++, ((v) & 0x000000ff) >> 0); \ + } \ + while (0) +#else +#define STORE_24(img,l,o,v) \ + do \ + { \ + uint8_t *__tmp = (l) + 3 * (o); \ + \ + WRITE ((img), __tmp++, ((v) & 0x000000ff) >> 0); \ + WRITE ((img), __tmp++, ((v) & 0x0000ff00) >> 8); \ + WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16); \ + } \ + while (0) +#endif /* * YV12 setup and access macros @@ -86,972 +192,543 @@ ((uint8_t *) ((bits) + offset0 + \ ((stride) >> 1) * ((line) >> 1))) -/********************************** Fetch ************************************/ +/* Misc. helpers */ -static void -fetch_scanline_a8r8g8b8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static force_inline void +get_shifts (pixman_format_code_t format, + int *a, + int *r, + int *g, + int *b) { - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - - MEMCPY_WRAPPED (image, - buffer, (const uint32_t *)bits + x, - width * sizeof(uint32_t)); -} - -static void -fetch_scanline_x8r8g8b8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (const uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) - *buffer++ = READ (image, pixel++) | 0xff000000; -} - -static void -fetch_scanline_a8b8g8r8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) + switch (PIXMAN_FORMAT_TYPE (format)) { - uint32_t p = READ (image, pixel++); - - *buffer++ = (p & 0xff00ff00) | - ((p >> 16) & 0xff) | - ((p & 0xff) << 16); + case PIXMAN_TYPE_A: + *b = 0; + *g = 0; + *r = 0; + *a = 0; + break; + + case PIXMAN_TYPE_ARGB: + case PIXMAN_TYPE_ARGB_SRGB: + *b = 0; + *g = *b + PIXMAN_FORMAT_B (format); + *r = *g + PIXMAN_FORMAT_G (format); + *a = *r + PIXMAN_FORMAT_R (format); + break; + + case PIXMAN_TYPE_ABGR: + *r = 0; + *g = *r + PIXMAN_FORMAT_R (format); + *b = *g + PIXMAN_FORMAT_G (format); + *a = *b + PIXMAN_FORMAT_B (format); + break; + + case PIXMAN_TYPE_BGRA: + /* With BGRA formats we start counting at the high end of the pixel */ + *b = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format); + *g = *b - PIXMAN_FORMAT_B (format); + *r = *g - PIXMAN_FORMAT_G (format); + *a = *r - PIXMAN_FORMAT_R (format); + break; + + case PIXMAN_TYPE_RGBA: + /* With BGRA formats we start counting at the 
high end of the pixel */ + *r = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format); + *g = *r - PIXMAN_FORMAT_R (format); + *b = *g - PIXMAN_FORMAT_G (format); + *a = *b - PIXMAN_FORMAT_B (format); + break; + + default: + assert (0); + break; } } -static void -fetch_scanline_x8b8g8r8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static force_inline uint32_t +convert_channel (uint32_t pixel, uint32_t def_value, + int n_from_bits, int from_shift, + int n_to_bits, int to_shift) { - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - - *buffer++ = 0xff000000 | - (p & 0x0000ff00) | - ((p >> 16) & 0xff) | - ((p & 0xff) << 16); - } + uint32_t v; + + if (n_from_bits && n_to_bits) + v = unorm_to_unorm (pixel >> from_shift, n_from_bits, n_to_bits); + else if (n_to_bits) + v = def_value; + else + v = 0; + + return (v & ((1 << n_to_bits) - 1)) << to_shift; } -static void -fetch_scanline_b8g8r8a8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - - *buffer++ = (((p & 0xff000000) >> 24) | - ((p & 0x00ff0000) >> 8) | - ((p & 0x0000ff00) << 8) | - ((p & 0x000000ff) << 24)); - } -} - -static void -fetch_scanline_b8g8r8x8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - - *buffer++ = (0xff000000 | - ((p & 0xff000000) >> 24) | - ((p & 0x00ff0000) >> 8) | - ((p & 0x0000ff00) << 8)); - } -} - -static void -fetch_scanline_x14r6g6b6 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (const uint32_t *)bits + x; - const uint32_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - r = ((p & 0x3f000) << 6) | ((p & 0x30000)); - g = ((p & 0x00fc0) << 4) | ((p & 0x00c00) >> 2); - b = ((p & 0x0003f) << 2) | ((p & 0x00030) >> 4); - - *buffer++ = 0xff000000 | r | g | b; - } -} - -/* Expects a uint64_t buffer */ -static void -fetch_scanline_a2r10g10b10 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * b, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = bits + x; - const uint32_t *end = pixel + width; - uint64_t *buffer = (uint64_t *)b; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint64_t a = p >> 30; - uint64_t r = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t b = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - a <<= 14; - a |= a >> 2; - a |= a >> 4; - a |= a >> 8; - - *buffer++ = a << 48 | r << 32 | g << 16 | b; - } -} - -/* Expects a uint64_t buffer */ -static void -fetch_scanline_x2r10g10b10 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * b, - const 
uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - uint64_t *buffer = (uint64_t *)b; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint64_t r = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t b = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b; - } -} - -/* Expects a uint64_t buffer */ -static void -fetch_scanline_a2b10g10r10 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * b, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = bits + x; - const uint32_t *end = pixel + width; - uint64_t *buffer = (uint64_t *)b; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint64_t a = p >> 30; - uint64_t b = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t r = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - a <<= 14; - a |= a >> 2; - a |= a >> 4; - a |= a >> 8; - - *buffer++ = a << 48 | r << 32 | g << 16 | b; - } -} - -/* Expects a uint64_t buffer */ -static void -fetch_scanline_x2b10g10r10 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * b, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint32_t *pixel = (uint32_t *)bits + x; - const uint32_t *end = pixel + width; - uint64_t *buffer = (uint64_t *)b; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint64_t b = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t r = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b; - } -} - -static void -fetch_scanline_r8g8b8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + 3 * x; - const uint8_t *end = pixel + 3 * width; - - while (pixel < end) - { - uint32_t b = 0xff000000; - -#ifdef WORDS_BIGENDIAN - b |= (READ (image, pixel++) << 16); - b |= (READ (image, pixel++) << 8); - b |= (READ (image, pixel++)); -#else - b |= (READ (image, pixel++)); - b |= (READ (image, pixel++) << 8); - b |= (READ (image, pixel++) << 16); -#endif - - *buffer++ = b; - } -} - -static void -fetch_scanline_b8g8r8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + 3 * x; - const uint8_t *end = pixel + 3 * width; - - while (pixel < end) - { - uint32_t b = 0xff000000; -#ifdef WORDS_BIGENDIAN - b |= (READ (image, pixel++)); - b |= (READ (image, pixel++) << 8); - b |= (READ (image, pixel++) << 16); -#else - b |= (READ (image, pixel++) << 16); - b |= (READ (image, pixel++) << 8); - b |= (READ (image, pixel++)); -#endif - *buffer++ = b; - } -} - -static void -fetch_scanline_r5g6b5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, 
pixel++); - uint32_t r = (((p) << 3) & 0xf8) | - (((p) << 5) & 0xfc00) | - (((p) << 8) & 0xf80000); - - r |= (r >> 5) & 0x70007; - r |= (r >> 6) & 0x300; - - *buffer++ = 0xff000000 | r; - } -} - -static void -fetch_scanline_b5g6r5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - b = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8; - g = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5; - r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a1r5g5b5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b, a; - - a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24; - r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9; - g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; - b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2; - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_x1r5g5b5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9; - g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; - b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a1b5g5r5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - uint32_t r, g, b, a; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - - a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24; - b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7; - g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; - r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_x1b5g5r5 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7; - g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; - r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a4r4g4b4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + 
width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b, a; - - a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16; - r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12; - g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; - b = ((p & 0x000f) | ((p & 0x000f) << 4)); - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_x4r4g4b4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12; - g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; - b = ((p & 0x000f) | ((p & 0x000f) << 4)); - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a4b4g4r4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b, a; - - a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16; - b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4; - g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; - r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16; - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_x4b4g4r4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint16_t *pixel = (const uint16_t *)bits + x; - const uint16_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4; - g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; - r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - *buffer++ = READ (image, pixel++) << 24; -} - -static void -fetch_scanline_r3g3b2 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - r = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16; - g = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8; - b = (((p & 0x03) ) | - ((p & 0x03) << 2) | - ((p & 0x03) << 4) | - ((p & 0x03) << 6)); - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_b2g3r3 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t r, g, b; - - 
b = p & 0xc0; - b |= b >> 2; - b |= b >> 4; - b &= 0xff; - - g = (p & 0x38) << 10; - g |= g >> 3; - g |= g >> 6; - g &= 0xff00; - - r = (p & 0x7) << 21; - r |= r >> 3; - r |= r >> 6; - r &= 0xff0000; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a2r2g2b2 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t a, r, g, b; - - a = ((p & 0xc0) * 0x55) << 18; - r = ((p & 0x30) * 0x55) << 12; - g = ((p & 0x0c) * 0x55) << 6; - b = ((p & 0x03) * 0x55); - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_a2b2g2r2 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - uint32_t a, r, g, b; - - a = ((p & 0xc0) * 0x55) << 18; - b = ((p & 0x30) * 0x55) >> 4; - g = ((p & 0x0c) * 0x55) << 6; - r = ((p & 0x03) * 0x55) << 16; - - *buffer++ = a | r | g | b; - } -} - -static void -fetch_scanline_c8 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const pixman_indexed_t * indexed = image->bits.indexed; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint32_t p = READ (image, pixel++); - - *buffer++ = indexed->rgba[p]; - } -} - -static void -fetch_scanline_x4a4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const uint8_t *pixel = (const uint8_t *)bits + x; - const uint8_t *end = pixel + width; - - while (pixel < end) - { - uint8_t p = READ (image, pixel++) & 0xf; - - *buffer++ = (p | (p << 4)) << 24; - } -} - -#define FETCH_8(img,l,o) (READ (img, (((uint8_t *)(l)) + ((o) >> 3)))) -#ifdef WORDS_BIGENDIAN -#define FETCH_4(img,l,o) \ - (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4)) -#else -#define FETCH_4(img,l,o) \ - (((4 * (o)) & 4) ? 
(FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf)) -#endif - -static void -fetch_scanline_a4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t p = FETCH_4 (image, bits, i + x); - - p |= p << 4; - - *buffer++ = p << 24; - } -} - -static void -fetch_scanline_r1g2b1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t p = FETCH_4 (image, bits, i + x); - uint32_t r, g, b; - - r = ((p & 0x8) * 0xff) << 13; - g = ((p & 0x6) * 0x55) << 7; - b = ((p & 0x1) * 0xff); - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_b1g2r1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t p = FETCH_4 (image, bits, i + x); - uint32_t r, g, b; - - b = ((p & 0x8) * 0xff) >> 3; - g = ((p & 0x6) * 0x55) << 7; - r = ((p & 0x1) * 0xff) << 16; - - *buffer++ = 0xff000000 | r | g | b; - } -} - -static void -fetch_scanline_a1r1g1b1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static force_inline uint32_t +convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixel) { + int a_from_shift, r_from_shift, g_from_shift, b_from_shift; + int a_to_shift, r_to_shift, g_to_shift, b_to_shift; uint32_t a, r, g, b; - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - for (i = 0; i < width; ++i) + get_shifts (from, &a_from_shift, &r_from_shift, &g_from_shift, &b_from_shift); + get_shifts (to, &a_to_shift, &r_to_shift, &g_to_shift, &b_to_shift); + + a = convert_channel (pixel, ~0, + PIXMAN_FORMAT_A (from), a_from_shift, + PIXMAN_FORMAT_A (to), a_to_shift); + + r = convert_channel (pixel, 0, + PIXMAN_FORMAT_R (from), r_from_shift, + PIXMAN_FORMAT_R (to), r_to_shift); + + g = convert_channel (pixel, 0, + PIXMAN_FORMAT_G (from), g_from_shift, + PIXMAN_FORMAT_G (to), g_to_shift); + + b = convert_channel (pixel, 0, + PIXMAN_FORMAT_B (from), b_from_shift, + PIXMAN_FORMAT_B (to), b_to_shift); + + return a | r | g | b; +} + +static force_inline uint32_t +convert_pixel_to_a8r8g8b8 (pixman_image_t *image, + pixman_format_code_t format, + uint32_t pixel) +{ + if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY || + PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR) { - uint32_t p = FETCH_4 (image, bits, i + x); - - a = ((p & 0x8) * 0xff) << 21; - r = ((p & 0x4) * 0xff) << 14; - g = ((p & 0x2) * 0xff) << 7; - b = ((p & 0x1) * 0xff); - - *buffer++ = a | r | g | b; + return image->bits.indexed->rgba[pixel]; + } + else + { + return convert_pixel (format, PIXMAN_a8r8g8b8, pixel); } } -static void -fetch_scanline_a1b1g1r1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static force_inline uint32_t +convert_pixel_from_a8r8g8b8 (pixman_image_t *image, + pixman_format_code_t format, uint32_t pixel) { - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - - for (i = 0; i < width; ++i) + if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY) { - uint32_t p = FETCH_4 (image, bits, i + x); - uint32_t a, r, g, 
b; + pixel = CONVERT_RGB24_TO_Y15 (pixel); - a = ((p & 0x8) * 0xff) << 21; - b = ((p & 0x4) * 0xff) >> 2; - g = ((p & 0x2) * 0xff) << 7; - r = ((p & 0x1) * 0xff) << 16; + return image->bits.indexed->ent[pixel & 0x7fff]; + } + else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR) + { + pixel = convert_pixel (PIXMAN_a8r8g8b8, PIXMAN_x1r5g5b5, pixel); - *buffer++ = a | r | g | b; + return image->bits.indexed->ent[pixel & 0x7fff]; + } + else + { + return convert_pixel (PIXMAN_a8r8g8b8, format, pixel); } } -static void -fetch_scanline_c4 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static force_inline uint32_t +fetch_and_convert_pixel (pixman_image_t * image, + const uint8_t * bits, + int offset, + pixman_format_code_t format) { - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const pixman_indexed_t * indexed = image->bits.indexed; - int i; - - for (i = 0; i < width; ++i) + uint32_t pixel; + + switch (PIXMAN_FORMAT_BPP (format)) { - uint32_t p = FETCH_4 (image, bits, i + x); - - *buffer++ = indexed->rgba[p]; + case 1: + pixel = FETCH_1 (image, bits, offset); + break; + + case 4: + pixel = FETCH_4 (image, bits, offset); + break; + + case 8: + pixel = READ (image, bits + offset); + break; + + case 16: + pixel = READ (image, ((uint16_t *)bits + offset)); + break; + + case 24: + pixel = FETCH_24 (image, bits, offset); + break; + + case 32: + pixel = READ (image, ((uint32_t *)bits + offset)); + break; + + default: + pixel = 0xffff00ff; /* As ugly as possible to detect the bug */ + break; + } + + return convert_pixel_to_a8r8g8b8 (image, format, pixel); +} + +static force_inline void +convert_and_store_pixel (bits_image_t * image, + uint8_t * dest, + int offset, + pixman_format_code_t format, + uint32_t pixel) +{ + uint32_t converted = convert_pixel_from_a8r8g8b8 ( + (pixman_image_t *)image, format, pixel); + + switch (PIXMAN_FORMAT_BPP (format)) + { + case 1: + STORE_1 (image, dest, offset, converted & 0x01); + break; + + case 4: + STORE_4 (image, dest, offset, converted & 0xf); + break; + + case 8: + WRITE (image, (dest + offset), converted & 0xff); + break; + + case 16: + WRITE (image, ((uint16_t *)dest + offset), converted & 0xffff); + break; + + case 24: + STORE_24 (image, dest, offset, converted); + break; + + case 32: + WRITE (image, ((uint32_t *)dest + offset), converted); + break; + + default: + *dest = 0x0; + break; } } -static void -fetch_scanline_a1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +#define MAKE_ACCESSORS(format) \ + static void \ + fetch_scanline_ ## format (pixman_image_t *image, \ + int x, \ + int y, \ + int width, \ + uint32_t * buffer, \ + const uint32_t *mask) \ + { \ + uint8_t *bits = \ + (uint8_t *)(image->bits.bits + y * image->bits.rowstride); \ + int i; \ + \ + for (i = 0; i < width; ++i) \ + { \ + *buffer++ = \ + fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \ + } \ + } \ + \ + static void \ + store_scanline_ ## format (bits_image_t * image, \ + int x, \ + int y, \ + int width, \ + const uint32_t *values) \ + { \ + uint8_t *dest = \ + (uint8_t *)(image->bits + y * image->rowstride); \ + int i; \ + \ + for (i = 0; i < width; ++i) \ + { \ + convert_and_store_pixel ( \ + image, dest, i + x, PIXMAN_ ## format, values[i]); \ + } \ + } \ + \ + static uint32_t \ + fetch_pixel_ ## format (bits_image_t *image, \ + int offset, \ + int line) \ + { \ + uint8_t *bits = \ + (uint8_t *)(image->bits + line * 
image->rowstride); \ + \ + return fetch_and_convert_pixel ((pixman_image_t *)image, \ + bits, offset, PIXMAN_ ## format); \ + } \ + \ + static const void *const __dummy__ ## format + +MAKE_ACCESSORS(a8r8g8b8); +MAKE_ACCESSORS(x8r8g8b8); +MAKE_ACCESSORS(a8b8g8r8); +MAKE_ACCESSORS(x8b8g8r8); +MAKE_ACCESSORS(x14r6g6b6); +MAKE_ACCESSORS(b8g8r8a8); +MAKE_ACCESSORS(b8g8r8x8); +MAKE_ACCESSORS(r8g8b8x8); +MAKE_ACCESSORS(r8g8b8a8); +MAKE_ACCESSORS(r8g8b8); +MAKE_ACCESSORS(b8g8r8); +MAKE_ACCESSORS(r5g6b5); +MAKE_ACCESSORS(b5g6r5); +MAKE_ACCESSORS(a1r5g5b5); +MAKE_ACCESSORS(x1r5g5b5); +MAKE_ACCESSORS(a1b5g5r5); +MAKE_ACCESSORS(x1b5g5r5); +MAKE_ACCESSORS(a4r4g4b4); +MAKE_ACCESSORS(x4r4g4b4); +MAKE_ACCESSORS(a4b4g4r4); +MAKE_ACCESSORS(x4b4g4r4); +MAKE_ACCESSORS(a8); +MAKE_ACCESSORS(c8); +MAKE_ACCESSORS(g8); +MAKE_ACCESSORS(r3g3b2); +MAKE_ACCESSORS(b2g3r3); +MAKE_ACCESSORS(a2r2g2b2); +MAKE_ACCESSORS(a2b2g2r2); +MAKE_ACCESSORS(x4a4); +MAKE_ACCESSORS(a4); +MAKE_ACCESSORS(g4); +MAKE_ACCESSORS(c4); +MAKE_ACCESSORS(r1g2b1); +MAKE_ACCESSORS(b1g2r1); +MAKE_ACCESSORS(a1r1g1b1); +MAKE_ACCESSORS(a1b1g1r1); +MAKE_ACCESSORS(a1); +MAKE_ACCESSORS(g1); + +/********************************** Fetch ************************************/ +/* Table mapping sRGB-encoded 8 bit numbers to linearly encoded + * floating point numbers. We assume that single precision + * floating point follows the IEEE 754 format. + */ +static const uint32_t to_linear_u[256] = { - const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - int i; - - for (i = 0; i < width; ++i) + 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40e, 0x3a9f22b4, 0x3ac6eb61, + 0x3aeeb40e, 0x3b0b3e5d, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518a, + 0x3b70f18a, 0x3b83e1c5, 0x3b8fe614, 0x3b9c87fb, 0x3ba9c9b5, 0x3bb7ad6d, + 0x3bc63547, 0x3bd5635f, 0x3be539bd, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, + 0x3c15a703, 0x3c1f45bc, 0x3c293e68, 0x3c3391f4, 0x3c3e4149, 0x3c494d43, + 0x3c54b6c7, 0x3c607eb1, 0x3c6ca5df, 0x3c792d22, 0x3c830aa8, 0x3c89af9e, + 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63432, 0x3cadd37d, 0x3cb5a601, + 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d2, 0x3cdfd00e, 0x3ce8ddb9, + 0x3cf2212c, 0x3cfb9ac1, 0x3d02a569, 0x3d0798dc, 0x3d0ca7e4, 0x3d11d2ae, + 0x3d171963, 0x3d1c7c2e, 0x3d21fb3a, 0x3d2796af, 0x3d2d4ebb, 0x3d332380, + 0x3d39152b, 0x3d3f23e3, 0x3d454fd0, 0x3d4b991c, 0x3d51ffeb, 0x3d588466, + 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c210, 0x3d7add25, 0x3d810b65, + 0x3d84b793, 0x3d88732e, 0x3d8c3e48, 0x3d9018f4, 0x3d940343, 0x3d97fd48, + 0x3d9c0714, 0x3da020b9, 0x3da44a48, 0x3da883d6, 0x3daccd70, 0x3db12728, + 0x3db59110, 0x3dba0b38, 0x3dbe95b2, 0x3dc3308f, 0x3dc7dbe0, 0x3dcc97b4, + 0x3dd1641c, 0x3dd6412a, 0x3ddb2eec, 0x3de02d75, 0x3de53cd3, 0x3dea5d16, + 0x3def8e52, 0x3df4d091, 0x3dfa23e5, 0x3dff885e, 0x3e027f06, 0x3e05427f, + 0x3e080ea2, 0x3e0ae376, 0x3e0dc104, 0x3e10a752, 0x3e139669, 0x3e168e50, + 0x3e198f0e, 0x3e1c98ab, 0x3e1fab2e, 0x3e22c6a0, 0x3e25eb08, 0x3e29186a, + 0x3e2c4ed0, 0x3e2f8e42, 0x3e32d6c4, 0x3e362861, 0x3e39831e, 0x3e3ce702, + 0x3e405416, 0x3e43ca5e, 0x3e4749e4, 0x3e4ad2ae, 0x3e4e64c2, 0x3e520027, + 0x3e55a4e6, 0x3e595303, 0x3e5d0a8a, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, + 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c34, 0x3e8014c0, + 0x3e822039, 0x3e84308b, 0x3e8645b8, 0x3e885fc3, 0x3e8a7eb0, 0x3e8ca281, + 0x3e8ecb3a, 0x3e90f8df, 0x3e932b72, 0x3e9562f6, 0x3e979f6f, 0x3e99e0e0, + 0x3e9c274e, 0x3e9e72b8, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d28a, + 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18332, 0x3eb3fc16, 
0x3eb67a15, + 0x3eb8fd34, 0x3ebb8576, 0x3ebe12de, 0x3ec0a56e, 0x3ec33d2a, 0x3ec5da14, + 0x3ec87c30, 0x3ecb2380, 0x3ecdd008, 0x3ed081ca, 0x3ed338c9, 0x3ed5f508, + 0x3ed8b68a, 0x3edb7d52, 0x3ede4962, 0x3ee11abe, 0x3ee3f168, 0x3ee6cd64, + 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, + 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, + 0x3f06f105, 0x3f0884ce, 0x3f0a1b54, 0x3f0bb499, 0x3f0d509f, 0x3f0eef65, + 0x3f1090ef, 0x3f12353c, 0x3f13dc50, 0x3f15862a, 0x3f1732cc, 0x3f18e237, + 0x3f1a946d, 0x3f1c4970, 0x3f1e013f, 0x3f1fbbde, 0x3f21794c, 0x3f23398c, + 0x3f24fca0, 0x3f26c286, 0x3f288b42, 0x3f2a56d3, 0x3f2c253d, 0x3f2df680, + 0x3f2fca9d, 0x3f31a195, 0x3f337b6a, 0x3f35581e, 0x3f3737b1, 0x3f391a24, + 0x3f3aff7a, 0x3f3ce7b2, 0x3f3ed2d0, 0x3f40c0d2, 0x3f42b1bc, 0x3f44a58e, + 0x3f469c49, 0x3f4895ee, 0x3f4a9280, 0x3f4c91ff, 0x3f4e946c, 0x3f5099c8, + 0x3f52a216, 0x3f54ad55, 0x3f56bb88, 0x3f58ccae, 0x3f5ae0cb, 0x3f5cf7de, + 0x3f5f11ec, 0x3f612ef0, 0x3f634eef, 0x3f6571ea, 0x3f6797e1, 0x3f69c0d6, + 0x3f6beccb, 0x3f6e1bc0, 0x3f704db6, 0x3f7282af, 0x3f74baac, 0x3f76f5ae, + 0x3f7933b6, 0x3f7b74c6, 0x3f7db8de, 0x3f800000 +}; + +static const float * const to_linear = (const float *)to_linear_u; + +static uint8_t +to_srgb (float f) +{ + uint8_t low = 0; + uint8_t high = 255; + + while (high - low > 1) { - uint32_t p = READ (image, bits + ((i + x) >> 5)); - uint32_t a; - -#ifdef WORDS_BIGENDIAN - a = p >> (0x1f - ((i + x) & 0x1f)); -#else - a = p >> ((i + x) & 0x1f); -#endif - a = a & 1; - a |= a << 1; - a |= a << 2; - a |= a << 4; - - *buffer++ = a << 24; + uint8_t mid = (low + high) / 2; + + if (to_linear[mid] > f) + high = mid; + else + low = mid; } + + if (to_linear[high] - f < f - to_linear[low]) + return high; + else + return low; } static void -fetch_scanline_g1 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +fetch_scanline_a8r8g8b8_sRGB_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) { const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; - const pixman_indexed_t * indexed = image->bits.indexed; - int i; - - for (i = 0; i < width; ++i) + const uint32_t *pixel = bits + x; + const uint32_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) { - uint32_t p = READ (image, bits + ((i + x) >> 5)); - uint32_t a; - -#ifdef WORDS_BIGENDIAN - a = p >> (0x1f - ((i + x) & 0x1f)); -#else - a = p >> ((i + x) & 0x1f); -#endif - a = a & 1; - - *buffer++ = indexed->rgba[a]; + uint32_t p = READ (image, pixel++); + argb_t *argb = buffer; + + argb->a = pixman_unorm_to_float ((p >> 24) & 0xff, 8); + + argb->r = to_linear [(p >> 16) & 0xff]; + argb->g = to_linear [(p >> 8) & 0xff]; + argb->b = to_linear [(p >> 0) & 0xff]; + + buffer++; + } +} + +/* Expects a float buffer */ +static void +fetch_scanline_a2r10g10b10_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = bits + x; + const uint32_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t a = p >> 30; + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; + + buffer->a = pixman_unorm_to_float (a, 2); + buffer->r = pixman_unorm_to_float (r, 10); + buffer->g = pixman_unorm_to_float (g, 10); + buffer->b = 
pixman_unorm_to_float (b, 10); + + buffer++; + } +} + +/* Expects a float buffer */ +static void +fetch_scanline_x2r10g10b10_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = (uint32_t *)bits + x; + const uint32_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; + + buffer->a = 1.0; + buffer->r = pixman_unorm_to_float (r, 10); + buffer->g = pixman_unorm_to_float (g, 10); + buffer->b = pixman_unorm_to_float (b, 10); + + buffer++; + } +} + +/* Expects a float buffer */ +static void +fetch_scanline_a2b10g10r10_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = bits + x; + const uint32_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t a = p >> 30; + uint64_t b = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t r = p & 0x3ff; + + buffer->a = pixman_unorm_to_float (a, 2); + buffer->r = pixman_unorm_to_float (r, 10); + buffer->g = pixman_unorm_to_float (g, 10); + buffer->b = pixman_unorm_to_float (b, 10); + + buffer++; + } +} + +/* Expects a float buffer */ +static void +fetch_scanline_x2b10g10r10_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = (uint32_t *)bits + x; + const uint32_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t b = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t r = p & 0x3ff; + + buffer->a = 1.0; + buffer->r = pixman_unorm_to_float (r, 10); + buffer->g = pixman_unorm_to_float (g, 10); + buffer->b = pixman_unorm_to_float (b, 10); + + buffer++; } } @@ -1128,11 +805,30 @@ fetch_scanline_yv12 (pixman_image_t *image, /**************************** Pixel wise fetching *****************************/ -/* Despite the type, expects a uint64_t buffer */ -static uint64_t -fetch_pixel_a2r10g10b10 (bits_image_t *image, - int offset, - int line) +static argb_t +fetch_pixel_x2r10g10b10_float (bits_image_t *image, + int offset, + int line) +{ + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t p = READ (image, bits + offset); + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; + argb_t argb; + + argb.a = 1.0; + argb.r = pixman_unorm_to_float (r, 10); + argb.g = pixman_unorm_to_float (g, 10); + argb.b = pixman_unorm_to_float (b, 10); + + return argb; +} + +static argb_t +fetch_pixel_a2r10g10b10_float (bits_image_t *image, + int offset, + int line) { uint32_t *bits = image->bits + line * image->rowstride; uint32_t p = READ (image, bits + offset); @@ -1140,43 +836,20 @@ fetch_pixel_a2r10g10b10 (bits_image_t *image, uint64_t r = (p >> 20) & 0x3ff; uint64_t g = (p >> 10) & 0x3ff; uint64_t b = p & 0x3ff; + argb_t argb; - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; + argb.a = pixman_unorm_to_float (a, 2); + argb.r = pixman_unorm_to_float (r, 10); + argb.g = pixman_unorm_to_float (g, 10); + argb.b = pixman_unorm_to_float 
(b, 10); - a <<= 14; - a |= a >> 2; - a |= a >> 4; - a |= a >> 8; - - return a << 48 | r << 32 | g << 16 | b; + return argb; } -/* Despite the type, this function expects a uint64_t buffer */ -static uint64_t -fetch_pixel_x2r10g10b10 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t p = READ (image, bits + offset); - uint64_t r = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t b = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - return 0xffffULL << 48 | r << 32 | g << 16 | b; -} - -/* Despite the type, expects a uint64_t buffer */ -static uint64_t -fetch_pixel_a2b10g10r10 (bits_image_t *image, - int offset, - int line) +static argb_t +fetch_pixel_a2b10g10r10_float (bits_image_t *image, + int offset, + int line) { uint32_t *bits = image->bits + line * image->rowstride; uint32_t p = READ (image, bits + offset); @@ -1184,584 +857,52 @@ fetch_pixel_a2b10g10r10 (bits_image_t *image, uint64_t b = (p >> 20) & 0x3ff; uint64_t g = (p >> 10) & 0x3ff; uint64_t r = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - a <<= 14; - a |= a >> 2; - a |= a >> 4; - a |= a >> 8; - - return a << 48 | r << 32 | g << 16 | b; + argb_t argb; + + argb.a = pixman_unorm_to_float (a, 2); + argb.r = pixman_unorm_to_float (r, 10); + argb.g = pixman_unorm_to_float (g, 10); + argb.b = pixman_unorm_to_float (b, 10); + + return argb; } -/* Despite the type, this function expects a uint64_t buffer */ -static uint64_t -fetch_pixel_x2b10g10r10 (bits_image_t *image, - int offset, - int line) +static argb_t +fetch_pixel_x2b10g10r10_float (bits_image_t *image, + int offset, + int line) { uint32_t *bits = image->bits + line * image->rowstride; uint32_t p = READ (image, bits + offset); uint64_t b = (p >> 20) & 0x3ff; uint64_t g = (p >> 10) & 0x3ff; uint64_t r = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - return 0xffffULL << 48 | r << 32 | g << 16 | b; + argb_t argb; + + argb.a = 1.0; + argb.r = pixman_unorm_to_float (r, 10); + argb.g = pixman_unorm_to_float (g, 10); + argb.b = pixman_unorm_to_float (b, 10); + + return argb; } -static uint32_t -fetch_pixel_a8r8g8b8 (bits_image_t *image, - int offset, - int line) +static argb_t +fetch_pixel_a8r8g8b8_sRGB_float (bits_image_t *image, + int offset, + int line) { uint32_t *bits = image->bits + line * image->rowstride; - return READ (image, (uint32_t *)bits + offset); -} + uint32_t p = READ (image, bits + offset); + argb_t argb; -static uint32_t -fetch_pixel_x8r8g8b8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; + argb.a = pixman_unorm_to_float ((p >> 24) & 0xff, 8); - return READ (image, (uint32_t *)bits + offset) | 0xff000000; -} + argb.r = to_linear [(p >> 16) & 0xff]; + argb.g = to_linear [(p >> 8) & 0xff]; + argb.b = to_linear [(p >> 0) & 0xff]; -static uint32_t -fetch_pixel_a8b8g8r8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint32_t *)bits + offset); - - return ((pixel & 0xff000000) | - ((pixel >> 16) & 0xff) | - (pixel & 0x0000ff00) | - ((pixel & 0xff) << 16)); -} - -static uint32_t -fetch_pixel_x8b8g8r8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint32_t *)bits + offset); - - return ((0xff000000) | - ((pixel >> 16) & 0xff) | - 
(pixel & 0x0000ff00) | - ((pixel & 0xff) << 16)); -} - -static uint32_t -fetch_pixel_b8g8r8a8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint32_t *)bits + offset); - - return ((pixel & 0xff000000) >> 24 | - (pixel & 0x00ff0000) >> 8 | - (pixel & 0x0000ff00) << 8 | - (pixel & 0x000000ff) << 24); -} - -static uint32_t -fetch_pixel_b8g8r8x8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint32_t *)bits + offset); - - return ((0xff000000) | - (pixel & 0xff000000) >> 24 | - (pixel & 0x00ff0000) >> 8 | - (pixel & 0x0000ff00) << 8); -} - -static uint32_t -fetch_pixel_x14r6g6b6 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint32_t *) bits + offset); - uint32_t r, g, b; - - r = ((pixel & 0x3f000) << 6) | ((pixel & 0x30000)); - g = ((pixel & 0x00fc0) << 4) | ((pixel & 0x00c00) >> 2); - b = ((pixel & 0x0003f) << 2) | ((pixel & 0x00030) >> 4); - - return 0xff000000 | r | g | b; -} - -static uint32_t -fetch_pixel_r8g8b8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint8_t *pixel = ((uint8_t *) bits) + (offset * 3); - -#ifdef WORDS_BIGENDIAN - return (0xff000000 | - (READ (image, pixel + 0) << 16) | - (READ (image, pixel + 1) << 8) | - (READ (image, pixel + 2))); -#else - return (0xff000000 | - (READ (image, pixel + 2) << 16) | - (READ (image, pixel + 1) << 8) | - (READ (image, pixel + 0))); -#endif -} - -static uint32_t -fetch_pixel_b8g8r8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint8_t *pixel = ((uint8_t *) bits) + (offset * 3); -#ifdef WORDS_BIGENDIAN - return (0xff000000 | - (READ (image, pixel + 2) << 16) | - (READ (image, pixel + 1) << 8) | - (READ (image, pixel + 0))); -#else - return (0xff000000 | - (READ (image, pixel + 0) << 16) | - (READ (image, pixel + 1) << 8) | - (READ (image, pixel + 2))); -#endif -} - -static uint32_t -fetch_pixel_r5g6b5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t r, g, b; - - r = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8; - g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5; - b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_b5g6r5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t r, g, b; - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - - b = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) >> 8; - g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5; - r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_a1r5g5b5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t a, r, g, b; - - a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24; - r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9; - g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; - b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; - - return (a | r | g | b); -} - 
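/* Editor's note -- illustrative sketch, not part of the patch: each of the
 * per-format fetchers deleted in this hunk widens its narrow channels with
 * the same bit-replication trick (OR-ing a shifted copy of the channel's
 * top bits into the gap below it, e.g. the "(pixel & 0x7000) >> 5" terms
 * above), which the new generic fetch_and_convert_pixel ()/convert_pixel ()
 * path centralizes. The helper below, expand_to_8 (a hypothetical name,
 * not a pixman function), shows the idea in isolation, assuming plain C99
 * and a channel width of 1..8 bits. */

#include <stdint.h>
#include <stdio.h>

/* Widen an n-bit channel to 8 bits by replicating its bits downward, so
 * that 0 stays 0 and the channel's maximum maps to exactly 0xff:
 * 5-bit 0b10110 -> 0b10110101. */
static uint8_t
expand_to_8 (uint32_t v, int n)
{
    uint32_t r = v << (8 - n);  /* left-align the channel */
    int filled = n;

    while (filled < 8)
    {
        r |= r >> filled;       /* copy the top bits into the gap */
        filled *= 2;
    }
    return (uint8_t) r;
}

int
main (void)
{
    /* An x1r5g5b5 pixel with all channel bits set expands to 0xffffff,
     * matching what the mask-based fetchers above compute. */
    uint16_t p = 0x7fff;

    printf ("%02x%02x%02x\n",
            expand_to_8 ((p >> 10) & 0x1f, 5),
            expand_to_8 ((p >> 5) & 0x1f, 5),
            expand_to_8 (p & 0x1f, 5));
    return 0;
}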
-static uint32_t -fetch_pixel_x1r5g5b5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t r, g, b; - - r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9; - g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; - b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2; - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_a1b5g5r5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t a, r, g, b; - - a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24; - b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7; - g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; - r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; - - return (a | r | g | b); -} - -static uint32_t -fetch_pixel_x1b5g5r5 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t r, g, b; - - b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7; - g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; - r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_a4r4g4b4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t a, r, g, b; - - a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; - r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; - g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; - b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); - - return (a | r | g | b); -} - -static uint32_t -fetch_pixel_x4r4g4b4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t r, g, b; - - r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; - g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; - b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_a4b4g4r4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t a, r, g, b; - - a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; - b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; - g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; - r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; - - return (a | r | g | b); -} - -static uint32_t -fetch_pixel_x4b4g4r4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint16_t *) bits + offset); - uint32_t r, g, b; - - b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; - g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; - r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_a8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - - return pixel << 24; -} - -static uint32_t -fetch_pixel_r3g3b2 (bits_image_t 
*image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - uint32_t r, g, b; - - r = ((pixel & 0xe0) | - ((pixel & 0xe0) >> 3) | - ((pixel & 0xc0) >> 6)) << 16; - - g = ((pixel & 0x1c) | - ((pixel & 0x18) >> 3) | - ((pixel & 0x1c) << 3)) << 8; - - b = (((pixel & 0x03) ) | - ((pixel & 0x03) << 2) | - ((pixel & 0x03) << 4) | - ((pixel & 0x03) << 6)); - - return (0xff000000 | r | g | b); -} - -static uint32_t -fetch_pixel_b2g3r3 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t p = READ (image, (uint8_t *) bits + offset); - uint32_t r, g, b; - - b = p & 0xc0; - b |= b >> 2; - b |= b >> 4; - b &= 0xff; - - g = (p & 0x38) << 10; - g |= g >> 3; - g |= g >> 6; - g &= 0xff00; - - r = (p & 0x7) << 21; - r |= r >> 3; - r |= r >> 6; - r &= 0xff0000; - - return 0xff000000 | r | g | b; -} - -static uint32_t -fetch_pixel_a2r2g2b2 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - uint32_t a, r, g, b; - - a = ((pixel & 0xc0) * 0x55) << 18; - r = ((pixel & 0x30) * 0x55) << 12; - g = ((pixel & 0x0c) * 0x55) << 6; - b = ((pixel & 0x03) * 0x55); - - return a | r | g | b; -} - -static uint32_t -fetch_pixel_a2b2g2r2 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - uint32_t a, r, g, b; - - a = ((pixel & 0xc0) * 0x55) << 18; - b = ((pixel & 0x30) * 0x55) >> 4; - g = ((pixel & 0x0c) * 0x55) << 6; - r = ((pixel & 0x03) * 0x55) << 16; - - return a | r | g | b; -} - -static uint32_t -fetch_pixel_c8 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - const pixman_indexed_t * indexed = image->indexed; - - return indexed->rgba[pixel]; -} - -static uint32_t -fetch_pixel_x4a4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, (uint8_t *) bits + offset); - - return ((pixel & 0xf) | ((pixel & 0xf) << 4)) << 24; -} - -static uint32_t -fetch_pixel_a4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - - pixel |= pixel << 4; - return pixel << 24; -} - -static uint32_t -fetch_pixel_r1g2b1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - uint32_t r, g, b; - - r = ((pixel & 0x8) * 0xff) << 13; - g = ((pixel & 0x6) * 0x55) << 7; - b = ((pixel & 0x1) * 0xff); - - return 0xff000000 | r | g | b; -} - -static uint32_t -fetch_pixel_b1g2r1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - uint32_t r, g, b; - - b = ((pixel & 0x8) * 0xff) >> 3; - g = ((pixel & 0x6) * 0x55) << 7; - r = ((pixel & 0x1) * 0xff) << 16; - - return 0xff000000 | r | g | b; -} - -static uint32_t -fetch_pixel_a1r1g1b1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - uint32_t a, r, g, b; - - a = ((pixel & 0x8) * 0xff) << 
21; - r = ((pixel & 0x4) * 0xff) << 14; - g = ((pixel & 0x2) * 0xff) << 7; - b = ((pixel & 0x1) * 0xff); - - return a | r | g | b; -} - -static uint32_t -fetch_pixel_a1b1g1r1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - uint32_t a, r, g, b; - - a = ((pixel & 0x8) * 0xff) << 21; - b = ((pixel & 0x4) * 0xff) >> 2; - g = ((pixel & 0x2) * 0xff) << 7; - r = ((pixel & 0x1) * 0xff) << 16; - - return a | r | g | b; -} - -static uint32_t -fetch_pixel_c4 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = FETCH_4 (image, bits, offset); - const pixman_indexed_t * indexed = image->indexed; - - return indexed->rgba[pixel]; -} - -static uint32_t -fetch_pixel_a1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, bits + (offset >> 5)); - uint32_t a; - -#ifdef WORDS_BIGENDIAN - a = pixel >> (0x1f - (offset & 0x1f)); -#else - a = pixel >> (offset & 0x1f); -#endif - a = a & 1; - a |= a << 1; - a |= a << 2; - a |= a << 4; - - return a << 24; -} - -static uint32_t -fetch_pixel_g1 (bits_image_t *image, - int offset, - int line) -{ - uint32_t *bits = image->bits + line * image->rowstride; - uint32_t pixel = READ (image, bits + (offset >> 5)); - const pixman_indexed_t * indexed = image->indexed; - uint32_t a; - -#ifdef WORDS_BIGENDIAN - a = pixel >> (0x1f - (offset & 0x1f)); -#else - a = pixel >> (offset & 0x1f); -#endif - a = a & 1; - - return indexed->rgba[a]; + return argb; } static uint32_t @@ -1821,980 +962,276 @@ fetch_pixel_yv12 (bits_image_t *image, /*********************************** Store ************************************/ -#define SPLIT_A(v) \ - uint32_t a = ((v) >> 24), \ - r = ((v) >> 16) & 0xff, \ - g = ((v) >> 8) & 0xff, \ - b = (v) & 0xff - -#define SPLIT(v) \ - uint32_t r = ((v) >> 16) & 0xff, \ - g = ((v) >> 8) & 0xff, \ - b = (v) & 0xff - static void -store_scanline_a2r10g10b10 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *v) +store_scanline_a2r10g10b10_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = bits + x; - uint64_t *values = (uint64_t *)v; + argb_t *values = (argb_t *)v; int i; - + for (i = 0; i < width; ++i) { + uint16_t a, r, g, b; + + a = pixman_float_to_unorm (values[i].a, 2); + r = pixman_float_to_unorm (values[i].r, 10); + g = pixman_float_to_unorm (values[i].g, 10); + b = pixman_float_to_unorm (values[i].b, 10); + WRITE (image, pixel++, - ((values[i] >> 32) & 0xc0000000) | - ((values[i] >> 18) & 0x3ff00000) | - ((values[i] >> 12) & 0xffc00) | - ((values[i] >> 6) & 0x3ff)); + (a << 30) | (r << 20) | (g << 10) | b); } } static void -store_scanline_x2r10g10b10 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *v) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint64_t *values = (uint64_t *)v; - uint32_t *pixel = bits + x; - int i; - - for (i = 0; i < width; ++i) - { - WRITE (image, pixel++, - ((values[i] >> 18) & 0x3ff00000) | - ((values[i] >> 12) & 0xffc00) | - ((values[i] >> 6) & 0x3ff)); - } -} - -static void -store_scanline_a2b10g10r10 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *v) +store_scanline_x2r10g10b10_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { uint32_t *bits = 
image->bits + image->rowstride * y; uint32_t *pixel = bits + x; - uint64_t *values = (uint64_t *)v; + argb_t *values = (argb_t *)v; int i; - + for (i = 0; i < width; ++i) { + uint16_t r, g, b; + + r = pixman_float_to_unorm (values[i].r, 10); + g = pixman_float_to_unorm (values[i].g, 10); + b = pixman_float_to_unorm (values[i].b, 10); + WRITE (image, pixel++, - ((values[i] >> 32) & 0xc0000000) | - ((values[i] >> 38) & 0x3ff) | - ((values[i] >> 12) & 0xffc00) | - ((values[i] << 14) & 0x3ff00000)); + (r << 20) | (g << 10) | b); } } static void -store_scanline_x2b10g10r10 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *v) +store_scanline_a2b10g10r10_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { uint32_t *bits = image->bits + image->rowstride * y; - uint64_t *values = (uint64_t *)v; uint32_t *pixel = bits + x; + argb_t *values = (argb_t *)v; int i; - + for (i = 0; i < width; ++i) { + uint16_t a, r, g, b; + + a = pixman_float_to_unorm (values[i].a, 2); + r = pixman_float_to_unorm (values[i].r, 10); + g = pixman_float_to_unorm (values[i].g, 10); + b = pixman_float_to_unorm (values[i].b, 10); + WRITE (image, pixel++, - ((values[i] >> 38) & 0x3ff) | - ((values[i] >> 12) & 0xffc00) | - ((values[i] << 14) & 0x3ff00000)); + (a << 30) | (b << 20) | (g << 10) | r); } } static void -store_scanline_a8r8g8b8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) +store_scanline_x2b10g10r10_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { uint32_t *bits = image->bits + image->rowstride * y; - - MEMCPY_WRAPPED (image, ((uint32_t *)bits) + x, values, - width * sizeof(uint32_t)); -} - -static void -store_scanline_x8r8g8b8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint32_t *pixel = (uint32_t *)bits + x; + uint32_t *pixel = bits + x; + argb_t *values = (argb_t *)v; int i; - - for (i = 0; i < width; ++i) - WRITE (image, pixel++, values[i] & 0xffffff); -} -static void -store_scanline_a8b8g8r8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint32_t *pixel = (uint32_t *)bits + x; - int i; - for (i = 0; i < width; ++i) { + uint16_t r, g, b; + + r = pixman_float_to_unorm (values[i].r, 10); + g = pixman_float_to_unorm (values[i].g, 10); + b = pixman_float_to_unorm (values[i].b, 10); + WRITE (image, pixel++, - (values[i] & 0xff00ff00) | - ((values[i] >> 16) & 0xff) | - ((values[i] & 0xff) << 16)); + (b << 20) | (g << 10) | r); } } static void -store_scanline_x8b8g8r8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) +store_scanline_a8r8g8b8_sRGB_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { uint32_t *bits = image->bits + image->rowstride * y; - uint32_t *pixel = (uint32_t *)bits + x; + uint32_t *pixel = bits + x; + argb_t *values = (argb_t *)v; int i; - + for (i = 0; i < width; ++i) { + uint8_t a, r, g, b; + + a = pixman_float_to_unorm (values[i].a, 8); + r = to_srgb (values[i].r); + g = to_srgb (values[i].g); + b = to_srgb (values[i].b); + WRITE (image, pixel++, - (values[i] & 0x0000ff00) | - ((values[i] >> 16) & 0xff) | - ((values[i] & 0xff) << 16)); - } -} - -static void -store_scanline_b8g8r8a8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint32_t 
*pixel = (uint32_t *)bits + x; - int i; - - for (i = 0; i < width; ++i) - { - WRITE (image, pixel++, - ((values[i] >> 24) & 0x000000ff) | - ((values[i] >> 8) & 0x0000ff00) | - ((values[i] << 8) & 0x00ff0000) | - ((values[i] << 24) & 0xff000000)); - } -} - -static void -store_scanline_b8g8r8x8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint32_t *pixel = (uint32_t *)bits + x; - int i; - - for (i = 0; i < width; ++i) - { - WRITE (image, pixel++, - ((values[i] >> 8) & 0x0000ff00) | - ((values[i] << 8) & 0x00ff0000) | - ((values[i] << 24) & 0xff000000)); - } -} - -static void -store_scanline_x14r6g6b6 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint32_t *pixel = ((uint32_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t s = values[i]; - uint32_t r, g, b; - - r = (s & 0xfc0000) >> 6; - g = (s & 0x00fc00) >> 4; - b = (s & 0x0000fc) >> 2; - - WRITE (image, pixel++, r | g | b); - } -} - -static void -store_scanline_r8g8b8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + 3 * x; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t val = values[i]; - -#ifdef WORDS_BIGENDIAN - WRITE (image, pixel++, (val & 0x00ff0000) >> 16); - WRITE (image, pixel++, (val & 0x0000ff00) >> 8); - WRITE (image, pixel++, (val & 0x000000ff) >> 0); -#else - WRITE (image, pixel++, (val & 0x000000ff) >> 0); - WRITE (image, pixel++, (val & 0x0000ff00) >> 8); - WRITE (image, pixel++, (val & 0x00ff0000) >> 16); -#endif - } -} - -static void -store_scanline_b8g8r8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + 3 * x; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t val = values[i]; - -#ifdef WORDS_BIGENDIAN - WRITE (image, pixel++, (val & 0x000000ff) >> 0); - WRITE (image, pixel++, (val & 0x0000ff00) >> 8); - WRITE (image, pixel++, (val & 0x00ff0000) >> 16); -#else - WRITE (image, pixel++, (val & 0x00ff0000) >> 16); - WRITE (image, pixel++, (val & 0x0000ff00) >> 8); - WRITE (image, pixel++, (val & 0x000000ff) >> 0); -#endif - } -} - -static void -store_scanline_r5g6b5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t s = values[i]; - - WRITE (image, pixel++, - ((s >> 3) & 0x001f) | - ((s >> 5) & 0x07e0) | - ((s >> 8) & 0xf800)); - } -} - -static void -store_scanline_b5g6r5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((b << 8) & 0xf800) | - ((g << 3) & 0x07e0) | - ((r >> 3) )); - } -} - -static void -store_scanline_a1r5g5b5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - - WRITE (image, pixel++, - ((a << 8) & 0x8000) | - ((r << 7) & 0x7c00) 
| - ((g << 2) & 0x03e0) | - ((b >> 3) )); - } -} - -static void -store_scanline_x1r5g5b5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((r << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((b >> 3) )); - } -} - -static void -store_scanline_a1b5g5r5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - - WRITE (image, pixel++, - ((a << 8) & 0x8000) | - ((b << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((r >> 3) )); - } -} - -static void -store_scanline_x1b5g5r5 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, ((b << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((r >> 3) )); - } -} - -static void -store_scanline_a4r4g4b4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - - WRITE (image, pixel++, - ((a << 8) & 0xf000) | - ((r << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((b >> 4) )); - } -} - -static void -store_scanline_x4r4g4b4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((r << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((b >> 4) )); - } -} - -static void -store_scanline_a4b4g4r4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - WRITE (image, pixel++, ((a << 8) & 0xf000) | - ((b << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((r >> 4) )); - } -} - -static void -store_scanline_x4b4g4r4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint16_t *pixel = ((uint16_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((b << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((r >> 4) )); - } -} - -static void -store_scanline_a8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - WRITE (image, pixel++, values[i] >> 24); - } -} - -static void -store_scanline_r3g3b2 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((r ) & 0xe0) | - ((g >> 3) & 0x1c) | - ((b >> 6) )); - } -} - -static void -store_scanline_b2g3r3 (bits_image_t * 
image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT (values[i]); - - WRITE (image, pixel++, - ((b ) & 0xc0) | - ((g >> 2) & 0x38) | - ((r >> 5) )); - } -} - -static void -store_scanline_a2r2g2b2 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - - WRITE (image, pixel++, - ((a ) & 0xc0) | - ((r >> 2) & 0x30) | - ((g >> 4) & 0x0c) | - ((b >> 6) )); - } -} - -static void -store_scanline_a2b2g2r2 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - { - SPLIT_A (values[i]); - - WRITE (image, pixel++, - ((a ) & 0xc0) | - ((b >> 2) & 0x30) | - ((g >> 4) & 0x0c) | - ((r >> 6) )); - } -} - -static void -store_scanline_c8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - const pixman_indexed_t *indexed = image->indexed; - int i; - - for (i = 0; i < width; ++i) - WRITE (image, pixel++, RGB24_TO_ENTRY (indexed,values[i])); -} - -static void -store_scanline_g8 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - const pixman_indexed_t *indexed = image->indexed; - int i; - - for (i = 0; i < width; ++i) - WRITE (image, pixel++, RGB24_TO_ENTRY_Y (indexed,values[i])); -} - -static void -store_scanline_x4a4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - uint8_t *pixel = ((uint8_t *) bits) + x; - int i; - - for (i = 0; i < width; ++i) - WRITE (image, pixel++, values[i] >> 28); -} - -#define STORE_8(img,l,o,v) (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v))) -#ifdef WORDS_BIGENDIAN - -#define STORE_4(img,l,o,v) \ - do \ - { \ - int bo = 4 * (o); \ - int v4 = (v) & 0x0f; \ - \ - STORE_8 (img, l, bo, ( \ - bo & 4 ? \ - (FETCH_8 (img, l, bo) & 0xf0) | (v4) : \ - (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4))); \ - } while (0) -#else - -#define STORE_4(img,l,o,v) \ - do \ - { \ - int bo = 4 * (o); \ - int v4 = (v) & 0x0f; \ - \ - STORE_8 (img, l, bo, ( \ - bo & 4 ? 
\ - (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) : \ - (FETCH_8 (img, l, bo) & 0xf0) | (v4))); \ - } while (0) -#endif - -static void -store_scanline_a4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - STORE_4 (image, bits, i + x, values[i] >> 28); -} - -static void -store_scanline_r1g2b1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - SPLIT (values[i]); - pixel = (((r >> 4) & 0x8) | - ((g >> 5) & 0x6) | - ((b >> 7) )); - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_b1g2r1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - SPLIT (values[i]); - pixel = (((b >> 4) & 0x8) | - ((g >> 5) & 0x6) | - ((r >> 7) )); - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_a1r1g1b1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - SPLIT_A (values[i]); - pixel = (((a >> 4) & 0x8) | - ((r >> 5) & 0x4) | - ((g >> 6) & 0x2) | - ((b >> 7) )); - - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_a1b1g1r1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - SPLIT_A (values[i]); - pixel = (((a >> 4) & 0x8) | - ((b >> 5) & 0x4) | - ((g >> 6) & 0x2) | - ((r >> 7) )); - - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_c4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - const pixman_indexed_t *indexed = image->indexed; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - pixel = RGB24_TO_ENTRY (indexed, values[i]); - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_g4 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - const pixman_indexed_t *indexed = image->indexed; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t pixel; - - pixel = RGB24_TO_ENTRY_Y (indexed, values[i]); - STORE_4 (image, bits, i + x, pixel); - } -} - -static void -store_scanline_a1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t *pixel = ((uint32_t *) bits) + ((i + x) >> 5); - uint32_t mask, v; - -#ifdef WORDS_BIGENDIAN - mask = 1 << (0x1f - ((i + x) & 0x1f)); -#else - mask = 1 << ((i + x) & 0x1f); -#endif - v = values[i] & 0x80000000 ? 
mask : 0; - - WRITE (image, pixel, (READ (image, pixel) & ~mask) | v); - } -} - -static void -store_scanline_g1 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) -{ - uint32_t *bits = image->bits + image->rowstride * y; - const pixman_indexed_t *indexed = image->indexed; - int i; - - for (i = 0; i < width; ++i) - { - uint32_t *pixel = ((uint32_t *) bits) + ((i + x) >> 5); - uint32_t mask, v; - -#ifdef WORDS_BIGENDIAN - mask = 1 << (0x1f - ((i + x) & 0x1f)); -#else - mask = 1 << ((i + x) & 0x1f); -#endif - v = RGB24_TO_ENTRY_Y (indexed, values[i]) & 0x1 ? mask : 0; - - WRITE (image, pixel, (READ (image, pixel) & ~mask) | v); + (a << 24) | (r << 16) | (g << 8) | b); } } /* - * Contracts a 64bpp image to 32bpp and then stores it using a regular 32-bit - * store proc. Despite the type, this function expects a uint64_t buffer. + * Contracts a floating point image to 32bpp and then stores it using a + * regular 32-bit store proc. Despite the type, this function expects an + * argb_t buffer. */ static void -store_scanline_generic_64 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *values) +store_scanline_generic_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { uint32_t *argb8_pixels; - + assert (image->common.type == BITS); - + argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t)); if (!argb8_pixels) return; - + /* Contract the scanline. We could do this in place if values weren't * const. */ - pixman_contract (argb8_pixels, (uint64_t *)values, width); - + pixman_contract_from_float (argb8_pixels, (argb_t *)values, width); + image->store_scanline_32 (image, x, y, width, argb8_pixels); - + free (argb8_pixels); } -/* Despite the type, this function expects both buffer - * and mask to be uint64_t - */ static void -fetch_scanline_generic_64 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +fetch_scanline_generic_float (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask) { - pixman_format_code_t format; - - /* Fetch the pixels into the first half of buffer and then expand them in - * place. - */ image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL); - format = image->bits.format; - if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR || - PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY) - { - /* Indexed formats are mapped to a8r8g8b8 with full - * precision, so when expanding we shouldn't correct - * for the width of the channels - */ - - format = PIXMAN_a8r8g8b8; - } - - pixman_expand ((uint64_t *)buffer, buffer, format, width); + pixman_expand_to_float ((argb_t *)buffer, buffer, image->bits.format, width); } -/* Despite the type, this function expects a uint64_t *buffer */ -static uint64_t -fetch_pixel_generic_64 (bits_image_t *image, - int offset, - int line) +/* The 32_sRGB paths should be deleted after narrow processing + * is no longer invoked for formats that are considered wide. 
+ * (Also see fetch_pixel_generic_lossy_32) */
+static void
+fetch_scanline_a8r8g8b8_32_sRGB (pixman_image_t *image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 uint32_t       *buffer,
+                                 const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint32_t tmp;
+
+    while (pixel < end)
+    {
+        uint8_t a, r, g, b;
+
+        tmp = READ (image, pixel++);
+
+        a = (tmp >> 24) & 0xff;
+        r = (tmp >> 16) & 0xff;
+        g = (tmp >> 8) & 0xff;
+        b = (tmp >> 0) & 0xff;
+
+        r = to_linear[r] * 255.0f + 0.5f;
+        g = to_linear[g] * 255.0f + 0.5f;
+        b = to_linear[b] * 255.0f + 0.5f;
+
+        *buffer++ = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+    }
+}
+
+static uint32_t
+fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
+                              int           offset,
+                              int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t tmp = READ (image, bits + offset);
+    uint8_t a, r, g, b;
+
+    a = (tmp >> 24) & 0xff;
+    r = (tmp >> 16) & 0xff;
+    g = (tmp >> 8) & 0xff;
+    b = (tmp >> 0) & 0xff;
+
+    r = to_linear[r] * 255.0f + 0.5f;
+    g = to_linear[g] * 255.0f + 0.5f;
+    b = to_linear[b] * 255.0f + 0.5f;
+
+    return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
+static void
+store_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    uint32_t tmp;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+        uint8_t a, r, g, b;
+
+        /* The buffer holds one a8r8g8b8 pixel per 32-bit word; the alpha
+         * byte is shifted back into bits 24-31 when repacking. */
+        tmp = v[i];
+
+        a = (tmp >> 24) & 0xff;
+        r = (tmp >> 16) & 0xff;
+        g = (tmp >> 8) & 0xff;
+        b = (tmp >> 0) & 0xff;
+
+        r = to_srgb (r * (1/255.0f));
+        g = to_srgb (g * (1/255.0f));
+        b = to_srgb (b * (1/255.0f));
+
+        WRITE (image, pixel++, (a << 24) | (r << 16) | (g << 8) | (b << 0));
+    }
+}
+
+static argb_t
+fetch_pixel_generic_float (bits_image_t *image,
+                           int           offset,
+                           int           line)
 {
     uint32_t pixel32 = image->fetch_pixel_32 (image, offset, line);
-    uint64_t result;
-    pixman_format_code_t format;
+    argb_t f;
 
-    format = image->format;
-    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR ||
-        PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
-    {
-        /* Indexed formats are mapped to a8r8g8b8 with full
-         * precision, so when expanding we shouldn't correct
-         * for the width of the channels
-         */
-
-        format = PIXMAN_a8r8g8b8;
-    }
-
-    pixman_expand ((uint64_t *)&result, &pixel32, format, 1);
+    pixman_expand_to_float (&f, &pixel32, image->format, 1);
 
-    return result;
+    return f;
 }
 
 /*
@@ -2808,10 +1245,10 @@ fetch_pixel_generic_lossy_32 (bits_image_t *image,
                               int           offset,
                               int           line)
 {
-    uint64_t pixel64 = image->fetch_pixel_64 (image, offset, line);
+    argb_t pixel64 = image->fetch_pixel_float (image, offset, line);
     uint32_t result;
-
-    pixman_contract (&result, &pixel64, 1);
+
+    pixman_contract_from_float (&result, &pixel64, 1);
 
     return result;
 }
@@ -2820,20 +1257,22 @@ typedef struct
 {
     pixman_format_code_t format;
     fetch_scanline_t     fetch_scanline_32;
-    fetch_scanline_t     fetch_scanline_64;
+    fetch_scanline_t     fetch_scanline_float;
     fetch_pixel_32_t     fetch_pixel_32;
-    fetch_pixel_64_t     fetch_pixel_64;
+    fetch_pixel_float_t  fetch_pixel_float;
     store_scanline_t     store_scanline_32;
-    store_scanline_t     store_scanline_64;
+    store_scanline_t     store_scanline_float;
 } format_info_t;
 
 #define FORMAT_INFO(format) \
     { \
        PIXMAN_ ## format, \
        fetch_scanline_ ## format, \
-       fetch_scanline_generic_64, \
-       fetch_pixel_ ## format, fetch_pixel_generic_64, \
-       store_scanline_ ## format, 
store_scanline_generic_64 \ + fetch_scanline_generic_float, \ + fetch_pixel_ ## format, \ + fetch_pixel_generic_float, \ + store_scanline_ ## format, \ + store_scanline_generic_float \ } static const format_info_t accessors[] = @@ -2845,8 +1284,17 @@ static const format_info_t accessors[] = FORMAT_INFO (x8b8g8r8), FORMAT_INFO (b8g8r8a8), FORMAT_INFO (b8g8r8x8), + FORMAT_INFO (r8g8b8a8), + FORMAT_INFO (r8g8b8x8), FORMAT_INFO (x14r6g6b6), +/* sRGB formats */ + { PIXMAN_a8r8g8b8_sRGB, + fetch_scanline_a8r8g8b8_32_sRGB, fetch_scanline_a8r8g8b8_sRGB_float, + fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float, + store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float, + }, + /* 24bpp formats */ FORMAT_INFO (r8g8b8), FORMAT_INFO (b8g8r8), @@ -2873,8 +1321,6 @@ static const format_info_t accessors[] = FORMAT_INFO (c8), -#define fetch_scanline_g8 fetch_scanline_c8 -#define fetch_pixel_g8 fetch_pixel_c8 FORMAT_INFO (g8), #define fetch_scanline_x4c4 fetch_scanline_c8 @@ -2882,8 +1328,8 @@ static const format_info_t accessors[] = #define store_scanline_x4c4 store_scanline_c8 FORMAT_INFO (x4c4), -#define fetch_scanline_x4g4 fetch_scanline_c8 -#define fetch_pixel_x4g4 fetch_pixel_c8 +#define fetch_scanline_x4g4 fetch_scanline_g8 +#define fetch_pixel_x4g4 fetch_pixel_g8 #define store_scanline_x4g4 store_scanline_g8 FORMAT_INFO (x4g4), @@ -2898,8 +1344,6 @@ static const format_info_t accessors[] = FORMAT_INFO (c4), -#define fetch_scanline_g4 fetch_scanline_c4 -#define fetch_pixel_g4 fetch_pixel_c4 FORMAT_INFO (g4), /* 1bpp formats */ @@ -2909,34 +1353,34 @@ static const format_info_t accessors[] = /* Wide formats */ { PIXMAN_a2r10g10b10, - NULL, fetch_scanline_a2r10g10b10, - fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10, - NULL, store_scanline_a2r10g10b10 }, - + NULL, fetch_scanline_a2r10g10b10_float, + fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float, + NULL, store_scanline_a2r10g10b10_float }, + { PIXMAN_x2r10g10b10, - NULL, fetch_scanline_x2r10g10b10, - fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10, - NULL, store_scanline_x2r10g10b10 }, - + NULL, fetch_scanline_x2r10g10b10_float, + fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10_float, + NULL, store_scanline_x2r10g10b10_float }, + { PIXMAN_a2b10g10r10, - NULL, fetch_scanline_a2b10g10r10, - fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10, - NULL, store_scanline_a2b10g10r10 }, - + NULL, fetch_scanline_a2b10g10r10_float, + fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10_float, + NULL, store_scanline_a2b10g10r10_float }, + { PIXMAN_x2b10g10r10, - NULL, fetch_scanline_x2b10g10r10, - fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10, - NULL, store_scanline_x2b10g10r10 }, - + NULL, fetch_scanline_x2b10g10r10_float, + fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10_float, + NULL, store_scanline_x2b10g10r10_float }, + /* YUV formats */ { PIXMAN_yuy2, - fetch_scanline_yuy2, fetch_scanline_generic_64, - fetch_pixel_yuy2, fetch_pixel_generic_64, + fetch_scanline_yuy2, fetch_scanline_generic_float, + fetch_pixel_yuy2, fetch_pixel_generic_float, NULL, NULL }, - + { PIXMAN_yv12, - fetch_scanline_yv12, fetch_scanline_generic_64, - fetch_pixel_yv12, fetch_pixel_generic_64, + fetch_scanline_yv12, fetch_scanline_generic_float, + fetch_pixel_yv12, fetch_pixel_generic_float, NULL, NULL }, { PIXMAN_null }, @@ -2952,11 +1396,11 @@ setup_accessors (bits_image_t *image) if (info->format == image->format) { image->fetch_scanline_32 = info->fetch_scanline_32; - image->fetch_scanline_64 = 
info->fetch_scanline_64; + image->fetch_scanline_float = info->fetch_scanline_float; image->fetch_pixel_32 = info->fetch_pixel_32; - image->fetch_pixel_64 = info->fetch_pixel_64; + image->fetch_pixel_float = info->fetch_pixel_float; image->store_scanline_32 = info->store_scanline_32; - image->store_scanline_64 = info->store_scanline_64; + image->store_scanline_float = info->store_scanline_float; return; } diff --git a/programs/develop/libraries/pixman/pixman-accessor.h b/programs/develop/libraries/pixman/pixman-accessor.h index 90c8ea7b77..8e0b03621b 100644 --- a/programs/develop/libraries/pixman/pixman-accessor.h +++ b/programs/develop/libraries/pixman/pixman-accessor.h @@ -1,21 +1,10 @@ #ifdef PIXMAN_FB_ACCESSORS -#define ACCESS(sym) sym##_accessors - #define READ(img, ptr) \ (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr)))) #define WRITE(img, ptr,val) \ (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr)))) -#define MEMCPY_WRAPPED(img, dst, src, size) \ - do { \ - size_t _i; \ - uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src); \ - for(_i = 0; _i < size; _i++) { \ - WRITE((img), _dst +_i, READ((img), _src + _i)); \ - } \ - } while (0) - #define MEMSET_WRAPPED(img, dst, val, size) \ do { \ size_t _i; \ @@ -27,12 +16,8 @@ #else -#define ACCESS(sym) sym - #define READ(img, ptr) (*(ptr)) #define WRITE(img, ptr, val) (*(ptr) = (val)) -#define MEMCPY_WRAPPED(img, dst, src, size) \ - memcpy(dst, src, size) #define MEMSET_WRAPPED(img, dst, val, size) \ memset(dst, val, size) diff --git a/programs/develop/libraries/pixman/pixman-bits-image.c b/programs/develop/libraries/pixman/pixman-bits-image.c index b27a732848..75a39a1159 100644 --- a/programs/develop/libraries/pixman/pixman-bits-image.c +++ b/programs/develop/libraries/pixman/pixman-bits-image.c @@ -34,44 +34,20 @@ #include #include "pixman-private.h" #include "pixman-combine32.h" +#include "pixman-inlines.h" -/* Store functions */ -void -_pixman_image_store_scanline_32 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *buffer) +static uint32_t * +_pixman_image_get_scanline_generic_float (pixman_iter_t * iter, + const uint32_t *mask) { - image->store_scanline_32 (image, x, y, width, buffer); + pixman_iter_get_scanline_t fetch_32 = iter->data; + uint32_t *buffer = iter->buffer; - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; + fetch_32 (iter, NULL); - image->common.alpha_map->store_scanline_32 ( - image->common.alpha_map, x, y, width, buffer); - } -} + pixman_expand_to_float ((argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); -void -_pixman_image_store_scanline_64 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *buffer) -{ - image->store_scanline_64 (image, x, y, width, buffer); - - if (image->common.alpha_map) - { - x -= image->common.alpha_origin_x; - y -= image->common.alpha_origin_y; - - image->common.alpha_map->store_scanline_64 ( - image->common.alpha_map, x, y, width, buffer); - } + return iter->buffer; } /* Fetch functions */ @@ -92,34 +68,6 @@ fetch_pixel_no_alpha (bits_image_t *image, typedef uint32_t (* get_pixel_t) (bits_image_t *image, int x, int y, pixman_bool_t check_bounds); -static force_inline void -repeat (pixman_repeat_t repeat, int size, int *coord) -{ - switch (repeat) - { - case PIXMAN_REPEAT_NORMAL: - *coord = MOD (*coord, size); - break; - - case PIXMAN_REPEAT_PAD: - *coord = CLIP (*coord, 0, size - 1); - break; - - case PIXMAN_REPEAT_REFLECT: - *coord = MOD (*coord, size * 
2); - - if (*coord >= size) - *coord = size * 2 - *coord - 1; - break; - - case PIXMAN_REPEAT_NONE: - break; - - default: - break; - } -} - static force_inline uint32_t bits_image_fetch_pixel_nearest (bits_image_t *image, pixman_fixed_t x, @@ -131,8 +79,8 @@ bits_image_fetch_pixel_nearest (bits_image_t *image, if (image->common.repeat != PIXMAN_REPEAT_NONE) { - repeat (image->common.repeat, image->width, &x0); - repeat (image->common.repeat, image->height, &y0); + repeat (image->common.repeat, &x0, image->width); + repeat (image->common.repeat, &y0, image->height); return get_pixel (image, x0, y0, FALSE); } @@ -142,97 +90,6 @@ bits_image_fetch_pixel_nearest (bits_image_t *image, } } -#if SIZEOF_LONG > 4 - -static force_inline uint32_t -bilinear_interpolation (uint32_t tl, uint32_t tr, - uint32_t bl, uint32_t br, - int distx, int disty) -{ - uint64_t distxy, distxiy, distixy, distixiy; - uint64_t tl64, tr64, bl64, br64; - uint64_t f, r; - - distxy = distx * disty; - distxiy = distx * (256 - disty); - distixy = (256 - distx) * disty; - distixiy = (256 - distx) * (256 - disty); - - /* Alpha and Blue */ - tl64 = tl & 0xff0000ff; - tr64 = tr & 0xff0000ff; - bl64 = bl & 0xff0000ff; - br64 = br & 0xff0000ff; - - f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; - r = f & 0x0000ff0000ff0000ull; - - /* Red and Green */ - tl64 = tl; - tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); - - tr64 = tr; - tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); - - bl64 = bl; - bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); - - br64 = br; - br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); - - f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; - r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); - - return (uint32_t)(r >> 16); -} - -#else - -static force_inline uint32_t -bilinear_interpolation (uint32_t tl, uint32_t tr, - uint32_t bl, uint32_t br, - int distx, int disty) -{ - int distxy, distxiy, distixy, distixiy; - uint32_t f, r; - - distxy = distx * disty; - distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ - distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ - distixiy = - 256 * 256 - (disty << 8) - - (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */ - - /* Blue */ - r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy - + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; - - /* Green */ - f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy - + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; - r |= f & 0xff000000; - - tl >>= 16; - tr >>= 16; - bl >>= 16; - br >>= 16; - r >>= 16; - - /* Red */ - f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy - + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; - r |= f & 0x00ff0000; - - /* Alpha */ - f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy - + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; - r |= f & 0xff000000; - - return r; -} - -#endif - static force_inline uint32_t bits_image_fetch_pixel_bilinear (bits_image_t *image, pixman_fixed_t x, @@ -249,8 +106,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image, x1 = x - pixman_fixed_1 / 2; y1 = y - pixman_fixed_1 / 2; - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x1); + disty = pixman_fixed_to_bilinear_weight (y1); x1 = pixman_fixed_to_int (x1); y1 = pixman_fixed_to_int (y1); @@ -259,10 +116,10 @@ 
bits_image_fetch_pixel_bilinear (bits_image_t *image, if (repeat_mode != PIXMAN_REPEAT_NONE) { - repeat (repeat_mode, width, &x1); - repeat (repeat_mode, height, &y1); - repeat (repeat_mode, width, &x2); - repeat (repeat_mode, height, &y2); + repeat (repeat_mode, &x1, width); + repeat (repeat_mode, &y1, height); + repeat (repeat_mode, &x2, width); + repeat (repeat_mode, &y2, height); tl = get_pixel (image, x1, y1, FALSE); bl = get_pixel (image, x1, y2, FALSE); @@ -280,14 +137,17 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image, return bilinear_interpolation (tl, tr, bl, br, distx, disty); } -static void -bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask) +static uint32_t * +bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, + const uint32_t *mask) { + + pixman_image_t * ima = iter->image; + int offset = iter->x; + int line = iter->y++; + int width = iter->width; + uint32_t * buffer = iter->buffer; + bits_image_t *bits = &ima->bits; pixman_fixed_t x_top, x_bottom, x; pixman_fixed_t ux_top, ux_bottom, ux; @@ -309,13 +169,13 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, v.vector[2] = pixman_fixed_1; if (!pixman_transform_point_3d (bits->common.transform, &v)) - return; + return iter->buffer; ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0]; x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; y = v.vector[1] - pixman_fixed_1/2; - disty = (y >> 8) & 0xff; + disty = pixman_fixed_to_bilinear_weight (y); /* Load the pointers to the first and second lines from the source * image that bilinear code must read. @@ -376,7 +236,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, if (top_row == zero && bottom_row == zero) { memset (buffer, 0, width * sizeof (uint32_t)); - return; + return iter->buffer; } else if (bits->format == PIXMAN_x8r8g8b8) { @@ -424,7 +284,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); @@ -449,7 +309,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); } @@ -473,7 +333,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; - distx = (x >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x); *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); } @@ -488,6 +348,8 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima, /* Zero fill to the left of the image */ while (buffer < end) *buffer++ = 0; + + return iter->buffer; } static force_inline uint32_t @@ -501,11 +363,11 @@ bits_image_fetch_pixel_convolution (bits_image_t *image, int y_off = (params[1] - pixman_fixed_1) >> 1; int32_t cwidth = pixman_fixed_to_int (params[0]); int32_t cheight = pixman_fixed_to_int (params[1]); - int32_t srtot, sgtot, sbtot, satot; int32_t i, j, x1, x2, y1, y2; pixman_repeat_t repeat_mode = image->common.repeat; int 
width = image->width; int height = image->height; + int srtot, sgtot, sbtot, satot; params += 2; @@ -531,8 +393,8 @@ bits_image_fetch_pixel_convolution (bits_image_t *image, if (repeat_mode != PIXMAN_REPEAT_NONE) { - repeat (repeat_mode, width, &rx); - repeat (repeat_mode, height, &ry); + repeat (repeat_mode, &rx, width); + repeat (repeat_mode, &ry, height); pixel = get_pixel (image, rx, ry, FALSE); } @@ -541,20 +403,118 @@ bits_image_fetch_pixel_convolution (bits_image_t *image, pixel = get_pixel (image, rx, ry, TRUE); } - srtot += RED_8 (pixel) * f; - sgtot += GREEN_8 (pixel) * f; - sbtot += BLUE_8 (pixel) * f; - satot += ALPHA_8 (pixel) * f; + srtot += (int)RED_8 (pixel) * f; + sgtot += (int)GREEN_8 (pixel) * f; + sbtot += (int)BLUE_8 (pixel) * f; + satot += (int)ALPHA_8 (pixel) * f; } params++; } } - satot >>= 16; - srtot >>= 16; - sgtot >>= 16; - sbtot >>= 16; + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); +} + +static uint32_t +bits_image_fetch_pixel_separable_convolution (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y, + get_pixel_t get_pixel) +{ + pixman_fixed_t *params = image->common.filter_params; + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + int cwidth = pixman_fixed_to_int (params[0]); + int cheight = pixman_fixed_to_int (params[1]); + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int x_phase_shift = 16 - x_phase_bits; + int y_phase_shift = 16 - y_phase_bits; + int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; + int y_off = ((cheight << 16) - pixman_fixed_1) >> 1; + pixman_fixed_t *y_params; + int srtot, sgtot, sbtot, satot; + int32_t x1, x2, y1, y2; + int32_t px, py; + int i, j; + + /* Round x and y to the middle of the closest phase before continuing. This + * ensures that the convolution matrix is aligned right, since it was + * positioned relative to a particular phase (and not relative to whatever + * exact fraction we happen to get here). 
+ */ + x = ((x >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); + y = ((y >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + + px = (x & 0xffff) >> x_phase_shift; + py = (y & 0xffff) >> y_phase_shift; + + y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + srtot = sgtot = sbtot = satot = 0; + + for (i = y1; i < y2; ++i) + { + pixman_fixed_48_16_t fy = *y_params++; + pixman_fixed_t *x_params = params + 4 + px * cwidth; + + if (fy) + { + for (j = x1; j < x2; ++j) + { + pixman_fixed_t fx = *x_params++; + int rx = j; + int ry = i; + + if (fx) + { + pixman_fixed_t f; + uint32_t pixel; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, &rx, width); + repeat (repeat_mode, &ry, height); + + pixel = get_pixel (image, rx, ry, FALSE); + } + else + { + pixel = get_pixel (image, rx, ry, TRUE); + } + + f = (fy * fx + 0x8000) >> 16; + + srtot += (int)RED_8 (pixel) * f; + sgtot += (int)GREEN_8 (pixel) * f; + sbtot += (int)BLUE_8 (pixel) * f; + satot += (int)ALPHA_8 (pixel) * f; + } + } + } + } + + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; satot = CLIP (satot, 0, 0xff); srtot = CLIP (srtot, 0, 0xff); @@ -587,6 +547,10 @@ bits_image_fetch_pixel_filtered (bits_image_t *image, return bits_image_fetch_pixel_convolution (image, x, y, get_pixel); break; + case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: + return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel); + break; + default: break; } @@ -594,14 +558,16 @@ bits_image_fetch_pixel_filtered (bits_image_t *image, return 0; } -static void -bits_image_fetch_affine_no_alpha (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, +static uint32_t * +bits_image_fetch_affine_no_alpha (pixman_iter_t * iter, const uint32_t * mask) { + pixman_image_t *image = iter->image; + int offset = iter->x; + int line = iter->y++; + int width = iter->width; + uint32_t * buffer = iter->buffer; + pixman_fixed_t x, y; pixman_fixed_t ux, uy; pixman_vector_t v; @@ -615,7 +581,7 @@ bits_image_fetch_affine_no_alpha (pixman_image_t * image, if (image->common.transform) { if (!pixman_transform_point_3d (image->common.transform, &v)) - return; + return iter->buffer; ux = image->common.transform->matrix[0][0]; uy = image->common.transform->matrix[1][0]; @@ -640,6 +606,8 @@ bits_image_fetch_affine_no_alpha (pixman_image_t * image, x += ux; y += uy; } + + return buffer; } /* General fetcher */ @@ -683,14 +651,16 @@ fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_boun return pixel; } -static void -bits_image_fetch_general (pixman_image_t * image, - int offset, - int line, - int width, - uint32_t * buffer, - const uint32_t * mask) +static uint32_t * +bits_image_fetch_general (pixman_iter_t *iter, + const uint32_t *mask) { + pixman_image_t *image = iter->image; + int offset = iter->x; + int line = iter->y++; + int width = iter->width; + uint32_t * buffer = iter->buffer; + pixman_fixed_t x, y, w; pixman_fixed_t ux, uy, uw; pixman_vector_t v; @@ -704,7 +674,7 @@ bits_image_fetch_general (pixman_image_t * image, if (image->common.transform) { if (!pixman_transform_point_3d (image->common.transform, &v)) - return; + return buffer; ux = image->common.transform->matrix[0][0]; uy = 
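The pointer arithmetic above implies a specific layout for the separable-convolution filter parameter block. A pair of hypothetical helpers making that layout explicit (editorial sketch, not part of the patch; in pixman the block is produced by pixman_filter_create_separable_convolution):

```c
/* Layout implied by the indexing above (all entries are pixman_fixed_t):
 *   params[0], params[1]  - matrix width and height
 *   params[2], params[3]  - x and y phase bits
 *   params + 4            - x matrices, one per x phase, cwidth entries each
 *   after the x matrices  - y matrices, one per y phase, cheight entries each
 */
static pixman_fixed_t *
x_matrix (pixman_fixed_t *params, int px)   /* hypothetical helper */
{
    int cwidth = pixman_fixed_to_int (params[0]);

    return params + 4 + px * cwidth;
}

static pixman_fixed_t *
y_matrix (pixman_fixed_t *params, int py)   /* hypothetical helper */
{
    int cwidth       = pixman_fixed_to_int (params[0]);
    int cheight      = pixman_fixed_to_int (params[1]);
    int x_phase_bits = pixman_fixed_to_int (params[2]);

    /* the y block starts after all (1 << x_phase_bits) x matrices */
    return params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
}
```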
image->common.transform->matrix[1][0]; @@ -746,12 +716,158 @@ bits_image_fetch_general (pixman_image_t * image, y += uy; w += uw; } + + return buffer; +} + +typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + +static force_inline void +bits_image_fetch_separable_convolution_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + bits_image_t *bits = &image->bits; + pixman_fixed_t *params = image->common.filter_params; + int cwidth = pixman_fixed_to_int (params[0]); + int cheight = pixman_fixed_to_int (params[1]); + int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; + int y_off = ((cheight << 16) - pixman_fixed_1) >> 1; + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int x_phase_shift = 16 - x_phase_bits; + int y_phase_shift = 16 - y_phase_bits; + pixman_fixed_t vx, vy; + pixman_fixed_t ux, uy; + pixman_vector_t v; + int k; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + vx = v.vector[0]; + vy = v.vector[1]; + + for (k = 0; k < width; ++k) + { + pixman_fixed_t *y_params; + int satot, srtot, sgtot, sbtot; + pixman_fixed_t x, y; + int32_t x1, x2, y1, y2; + int32_t px, py; + int i, j; + + if (mask && !mask[k]) + goto next; + + /* Round x and y to the middle of the closest phase before continuing. This + * ensures that the convolution matrix is aligned right, since it was + * positioned relative to a particular phase (and not relative to whatever + * exact fraction we happen to get here). + */ + x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); + y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + + px = (x & 0xffff) >> x_phase_shift; + py = (y & 0xffff) >> y_phase_shift; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + satot = srtot = sgtot = sbtot = 0; + + y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + + for (i = y1; i < y2; ++i) + { + pixman_fixed_t fy = *y_params++; + + if (fy) + { + pixman_fixed_t *x_params = params + 4 + px * cwidth; + + for (j = x1; j < x2; ++j) + { + pixman_fixed_t fx = *x_params++; + int rx = j; + int ry = i; + + if (fx) + { + pixman_fixed_t f; + uint32_t pixel, mask; + uint8_t *row; + + mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, &rx, bits->width); + repeat (repeat_mode, &ry, bits->height); + + row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry; + pixel = convert_pixel (row, rx) | mask; + } + else + { + if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height) + { + pixel = 0; + } + else + { + row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry; + pixel = convert_pixel (row, rx) | mask; + } + } + + f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16; + srtot += (int)RED_8 (pixel) * f; + sgtot += (int)GREEN_8 (pixel) * f; + sbtot += (int)BLUE_8 (pixel) * f; + satot += (int)ALPHA_8 (pixel) * f; + } + } + } + } + + satot = (satot + 0x8000) >> 16; + srtot = (srtot + 0x8000) >> 16; + sgtot = (sgtot + 0x8000) >> 16; + sbtot = (sbtot + 0x8000) >> 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0); + + next: + vx += ux; + vy += uy; + } } static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; -typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); - static force_inline void bits_image_fetch_bilinear_affine (pixman_image_t * image, int offset, @@ -800,8 +916,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image, x1 = x - pixman_fixed_1 / 2; y1 = y - pixman_fixed_1 / 2; - distx = (x1 >> 8) & 0xff; - disty = (y1 >> 8) & 0xff; + distx = pixman_fixed_to_bilinear_weight (x1); + disty = pixman_fixed_to_bilinear_weight (y1); y1 = pixman_fixed_to_int (y1); y2 = y1 + 1; @@ -814,10 +930,10 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image, mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; - repeat (repeat_mode, width, &x1); - repeat (repeat_mode, height, &y1); - repeat (repeat_mode, width, &x2); - repeat (repeat_mode, height, &y2); + repeat (repeat_mode, &x1, width); + repeat (repeat_mode, &y1, height); + repeat (repeat_mode, &x2, width); + repeat (repeat_mode, &y2, height); row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1; row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2; @@ -906,6 +1022,77 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image, } } +static force_inline void +bits_image_fetch_nearest_affine (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + + convert_pixel_t convert_pixel, + pixman_format_code_t format, + pixman_repeat_t repeat_mode) +{ + pixman_fixed_t x, y; + pixman_fixed_t ux, uy; + pixman_vector_t v; + bits_image_t *bits = &image->bits; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + + x = v.vector[0]; + y = v.vector[1]; + + for (i = 0; i < width; ++i) + { + int width, height, x0, y0; + const uint8_t *row; + + if (mask && !mask[i]) + goto next; + + width = image->bits.width; + height = image->bits.height; + x0 = pixman_fixed_to_int (x - pixman_fixed_e); + y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (repeat_mode == PIXMAN_REPEAT_NONE && + (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width)) + { + buffer[i] = 0; + } + else + { + uint32_t mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, &x0, width); + repeat (repeat_mode, &y0, height); + } + + row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0; + + buffer[i] = convert_pixel (row, x0) | mask; + } + + next: + x += ux; + y += uy; + } +} + static force_inline uint32_t convert_a8r8g8b8 (const uint8_t *row, int x) { @@ -927,54 +1114,89 @@ convert_a8 (const uint8_t *row, int x) static force_inline uint32_t convert_r5g6b5 (const uint8_t *row, int x) { - return CONVERT_0565_TO_0888 (*((uint16_t *)row + x)); + return convert_0565_to_0888 (*((uint16_t *)row + x)); } +#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode) \ + static uint32_t * \ + bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t *iter, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_separable_convolution_affine ( \ + iter->image, \ + iter->x, iter->y++, \ + iter->width, \ + iter->buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + \ + return iter->buffer; \ + } + #define MAKE_BILINEAR_FETCHER(name, format, repeat_mode) \ - static void \ - bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image, \ - int offset, \ - int line, \ - int width, \ - uint32_t * buffer, \ + static uint32_t * \ + bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t *iter, \ const uint32_t * mask) \ { \ - bits_image_fetch_bilinear_affine (image, offset, line, width, buffer, mask, \ + bits_image_fetch_bilinear_affine (iter->image, \ + iter->x, iter->y++, \ + iter->width, \ + iter->buffer, mask, \ convert_ ## format, \ PIXMAN_ ## format, \ repeat_mode); \ - } \ - extern int no_such_variable + return iter->buffer; \ + } -MAKE_BILINEAR_FETCHER (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD); -MAKE_BILINEAR_FETCHER (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE); -MAKE_BILINEAR_FETCHER (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT); -MAKE_BILINEAR_FETCHER (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL); -MAKE_BILINEAR_FETCHER (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD); -MAKE_BILINEAR_FETCHER (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE); -MAKE_BILINEAR_FETCHER (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT); -MAKE_BILINEAR_FETCHER (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL); -MAKE_BILINEAR_FETCHER (pad_a8, a8, PIXMAN_REPEAT_PAD); -MAKE_BILINEAR_FETCHER (none_a8, a8, PIXMAN_REPEAT_NONE); -MAKE_BILINEAR_FETCHER (reflect_a8, a8, PIXMAN_REPEAT_REFLECT); -MAKE_BILINEAR_FETCHER (normal_a8, a8, PIXMAN_REPEAT_NORMAL); -MAKE_BILINEAR_FETCHER (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD); -MAKE_BILINEAR_FETCHER (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE); -MAKE_BILINEAR_FETCHER (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT); -MAKE_BILINEAR_FETCHER (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL); +#define MAKE_NEAREST_FETCHER(name, format, repeat_mode) \ + static uint32_t * \ + bits_image_fetch_nearest_affine_ ## name (pixman_iter_t *iter, \ + const uint32_t * mask) \ + { \ + bits_image_fetch_nearest_affine (iter->image, \ + iter->x, iter->y++, \ + iter->width, \ + iter->buffer, mask, \ + convert_ ## format, \ + PIXMAN_ ## format, \ + repeat_mode); \ + return iter->buffer; \ + } + +#define MAKE_FETCHERS(name, format, repeat_mode) \ + MAKE_NEAREST_FETCHER (name, format, repeat_mode) \ + MAKE_BILINEAR_FETCHER (name, format, repeat_mode) \ + MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode) + +MAKE_FETCHERS (pad_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS 
(reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8, a8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8, a8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8, a8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_r5g6b5, r5g6b5, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_r5g6b5, r5g6b5, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_r5g6b5, r5g6b5, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_r5g6b5, r5g6b5, PIXMAN_REPEAT_NORMAL) static void -bits_image_fetch_solid_32 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) +replicate_pixel_32 (bits_image_t * bits, + int x, + int y, + int width, + uint32_t * buffer) { uint32_t color; uint32_t *end; - color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); + color = bits->fetch_pixel_32 (bits, x, y); end = buffer + width; while (buffer < end) @@ -982,18 +1204,17 @@ bits_image_fetch_solid_32 (pixman_image_t * image, } static void -bits_image_fetch_solid_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * b, - const uint32_t * unused) +replicate_pixel_float (bits_image_t * bits, + int x, + int y, + int width, + uint32_t * b) { - uint64_t color; - uint64_t *buffer = (uint64_t *)b; - uint64_t *end; + argb_t color; + argb_t *buffer = (argb_t *)b; + argb_t *end; - color = image->bits.fetch_pixel_64 (&image->bits, 0, 0); + color = bits->fetch_pixel_float (bits, x, y); end = buffer + width; while (buffer < end) @@ -1012,7 +1233,7 @@ bits_image_fetch_untransformed_repeat_none (bits_image_t *image, if (y < 0 || y >= image->height) { - memset (buffer, 0, width * (wide? 8 : 4)); + memset (buffer, 0, width * (wide? sizeof (argb_t) : 4)); return; } @@ -1020,10 +1241,10 @@ bits_image_fetch_untransformed_repeat_none (bits_image_t *image, { w = MIN (width, -x); - memset (buffer, 0, w * (wide ? 8 : 4)); + memset (buffer, 0, w * (wide ? sizeof (argb_t) : 4)); width -= w; - buffer += w * (wide? 2 : 1); + buffer += w * (wide? 4 : 1); x += w; } @@ -1032,16 +1253,16 @@ bits_image_fetch_untransformed_repeat_none (bits_image_t *image, w = MIN (width, image->width - x); if (wide) - image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL); else image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); width -= w; - buffer += w * (wide? 2 : 1); + buffer += w * (wide? 4 : 1); x += w; } - memset (buffer, 0, width * (wide ? 8 : 4)); + memset (buffer, 0, width * (wide ? 
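Each MAKE_FETCHERS invocation above instantiates three scanline fetchers, one per filter. For example, MAKE_FETCHERS (pad_a8, a8, PIXMAN_REPEAT_PAD) expands to the following (only the nearest variant is shown in full; the bilinear and separable-convolution variants are analogous):

```c
/* Mechanical expansion of MAKE_NEAREST_FETCHER (pad_a8, a8, PIXMAN_REPEAT_PAD) */
static uint32_t *
bits_image_fetch_nearest_affine_pad_a8 (pixman_iter_t *iter,
                                        const uint32_t *mask)
{
    bits_image_fetch_nearest_affine (iter->image,
                                     iter->x, iter->y++,
                                     iter->width,
                                     iter->buffer, mask,
                                     convert_a8,
                                     PIXMAN_a8,
                                     PIXMAN_REPEAT_PAD);
    return iter->buffer;
}
```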
sizeof (argb_t) : 4)); } static void @@ -1060,6 +1281,16 @@ bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, while (y >= image->height) y -= image->height; + if (image->width == 1) + { + if (wide) + replicate_pixel_float (image, 0, y, width, buffer); + else + replicate_pixel_32 (image, 0, y, width, buffer); + + return; + } + while (width) { while (x < 0) @@ -1070,24 +1301,26 @@ bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, w = MIN (width, image->width - x); if (wide) - image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL); + image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL); else image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL); - buffer += w * (wide? 2 : 1); + buffer += w * (wide? 4 : 1); x += w; width -= w; } } -static void -bits_image_fetch_untransformed_32 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) +static uint32_t * +bits_image_fetch_untransformed_32 (pixman_iter_t * iter, + const uint32_t *mask) { + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + if (image->common.repeat == PIXMAN_REPEAT_NONE) { bits_image_fetch_untransformed_repeat_none ( @@ -1098,16 +1331,21 @@ bits_image_fetch_untransformed_32 (pixman_image_t * image, bits_image_fetch_untransformed_repeat_normal ( &image->bits, FALSE, x, y, width, buffer); } + + iter->y++; + return buffer; } -static void -bits_image_fetch_untransformed_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * unused) +static uint32_t * +bits_image_fetch_untransformed_float (pixman_iter_t * iter, + const uint32_t *mask) { + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + if (image->common.repeat == PIXMAN_REPEAT_NONE) { bits_image_fetch_untransformed_repeat_none ( @@ -1118,24 +1356,21 @@ bits_image_fetch_untransformed_64 (pixman_image_t * image, bits_image_fetch_untransformed_repeat_normal ( &image->bits, TRUE, x, y, width, buffer); } + + iter->y++; + return buffer; } typedef struct { pixman_format_code_t format; uint32_t flags; - fetch_scanline_t fetch_32; - fetch_scanline_t fetch_64; + pixman_iter_get_scanline_t get_scanline_32; + pixman_iter_get_scanline_t get_scanline_float; } fetcher_info_t; static const fetcher_info_t fetcher_info[] = { - { PIXMAN_solid, - FAST_PATH_NO_ALPHA_MAP, - bits_image_fetch_solid_32, - bits_image_fetch_solid_64 - }, - { PIXMAN_any, (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_ID_TRANSFORM | @@ -1143,7 +1378,7 @@ static const fetcher_info_t fetcher_info[] = FAST_PATH_NO_PAD_REPEAT | FAST_PATH_NO_REFLECT_REPEAT), bits_image_fetch_untransformed_32, - bits_image_fetch_untransformed_64 + bits_image_fetch_untransformed_float }, #define FAST_BILINEAR_FLAGS \ @@ -1159,13 +1394,13 @@ static const fetcher_info_t fetcher_info[] = { PIXMAN_a8r8g8b8, FAST_BILINEAR_FLAGS, bits_image_fetch_bilinear_no_repeat_8888, - _pixman_image_get_scanline_generic_64 + _pixman_image_get_scanline_generic_float }, { PIXMAN_x8r8g8b8, FAST_BILINEAR_FLAGS, bits_image_fetch_bilinear_no_repeat_8888, - _pixman_image_get_scanline_generic_64 + _pixman_image_get_scanline_generic_float }, #define GENERAL_BILINEAR_FLAGS \ @@ -1175,39 +1410,76 @@ static const fetcher_info_t fetcher_info[] = FAST_PATH_AFFINE_TRANSFORM | \ FAST_PATH_BILINEAR_FILTER) +#define GENERAL_NEAREST_FLAGS \ + 
(FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_NEAREST_FILTER) + +#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS \ + (FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_HAS_TRANSFORM | \ + FAST_PATH_AFFINE_TRANSFORM | \ + FAST_PATH_SEPARABLE_CONVOLUTION_FILTER) + +#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_separable_convolution_affine_ ## name, \ + _pixman_image_get_scanline_generic_float \ + }, + #define BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ { PIXMAN_ ## format, \ GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ bits_image_fetch_bilinear_affine_ ## name, \ - _pixman_image_get_scanline_generic_64 \ + _pixman_image_get_scanline_generic_float \ }, - BILINEAR_AFFINE_FAST_PATH (pad_a8r8g8b8, a8r8g8b8, PAD) - BILINEAR_AFFINE_FAST_PATH (none_a8r8g8b8, a8r8g8b8, NONE) - BILINEAR_AFFINE_FAST_PATH (reflect_a8r8g8b8, a8r8g8b8, REFLECT) - BILINEAR_AFFINE_FAST_PATH (normal_a8r8g8b8, a8r8g8b8, NORMAL) - BILINEAR_AFFINE_FAST_PATH (pad_x8r8g8b8, x8r8g8b8, PAD) - BILINEAR_AFFINE_FAST_PATH (none_x8r8g8b8, x8r8g8b8, NONE) - BILINEAR_AFFINE_FAST_PATH (reflect_x8r8g8b8, x8r8g8b8, REFLECT) - BILINEAR_AFFINE_FAST_PATH (normal_x8r8g8b8, x8r8g8b8, NORMAL) - BILINEAR_AFFINE_FAST_PATH (pad_a8, a8, PAD) - BILINEAR_AFFINE_FAST_PATH (none_a8, a8, NONE) - BILINEAR_AFFINE_FAST_PATH (reflect_a8, a8, REFLECT) - BILINEAR_AFFINE_FAST_PATH (normal_a8, a8, NORMAL) - BILINEAR_AFFINE_FAST_PATH (pad_r5g6b5, r5g6b5, PAD) - BILINEAR_AFFINE_FAST_PATH (none_r5g6b5, r5g6b5, NONE) - BILINEAR_AFFINE_FAST_PATH (reflect_r5g6b5, r5g6b5, REFLECT) - BILINEAR_AFFINE_FAST_PATH (normal_r5g6b5, r5g6b5, NORMAL) +#define NEAREST_AFFINE_FAST_PATH(name, format, repeat) \ + { PIXMAN_ ## format, \ + GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ + bits_image_fetch_nearest_affine_ ## name, \ + _pixman_image_get_scanline_generic_float \ + }, + +#define AFFINE_FAST_PATHS(name, format, repeat) \ + SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \ + BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \ + NEAREST_AFFINE_FAST_PATH(name, format, repeat) + + AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD) + AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE) + AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT) + AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL) + AFFINE_FAST_PATHS (pad_a8, a8, PAD) + AFFINE_FAST_PATHS (none_a8, a8, NONE) + AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT) + AFFINE_FAST_PATHS (normal_a8, a8, NORMAL) + AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD) + AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE) + AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT) + AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL) /* Affine, no alpha */ { PIXMAN_any, (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM), bits_image_fetch_affine_no_alpha, - _pixman_image_get_scanline_generic_64 + _pixman_image_get_scanline_generic_float }, /* General */ - { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 }, + { PIXMAN_any, + 0, + bits_image_fetch_general, + _pixman_image_get_scanline_generic_float + }, { 
PIXMAN_null }, }; @@ -1215,24 +1487,179 @@ static const fetcher_info_t fetcher_info[] = static void bits_image_property_changed (pixman_image_t *image) { - uint32_t flags = image->common.flags; + _pixman_bits_image_setup_accessors (&image->bits); +} + +void +_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ pixman_format_code_t format = image->common.extended_format_code; + uint32_t flags = image->common.flags; const fetcher_info_t *info; - _pixman_bits_image_setup_accessors (&image->bits); - - info = fetcher_info; - while (info->format != PIXMAN_null) + for (info = fetcher_info; info->format != PIXMAN_null; ++info) { if ((info->format == format || info->format == PIXMAN_any) && (info->flags & flags) == info->flags) { - image->common.get_scanline_32 = info->fetch_32; - image->common.get_scanline_64 = info->fetch_64; - break; + if (iter->iter_flags & ITER_NARROW) + { + iter->get_scanline = info->get_scanline_32; + } + else + { + iter->data = info->get_scanline_32; + iter->get_scanline = info->get_scanline_float; + } + return; } + } - info++; + /* Just in case we somehow didn't find a scanline function */ + iter->get_scanline = _pixman_iter_get_scanline_noop; +} + +static uint32_t * +dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + + image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask); + if (image->common.alpha_map) + { + uint32_t *alpha; + + if ((alpha = malloc (width * sizeof (uint32_t)))) + { + int i; + + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->fetch_scanline_32 ( + (pixman_image_t *)image->common.alpha_map, + x, y, width, alpha, mask); + + for (i = 0; i < width; ++i) + { + buffer[i] &= ~0xff000000; + buffer[i] |= (alpha[i] & 0xff000000); + } + + free (alpha); + } + } + + return iter->buffer; +} + +static uint32_t * +dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + argb_t * buffer = (argb_t *)iter->buffer; + + image->fetch_scanline_float ( + (pixman_image_t *)image, x, y, width, (uint32_t *)buffer, mask); + if (image->common.alpha_map) + { + argb_t *alpha; + + if ((alpha = malloc (width * sizeof (argb_t)))) + { + int i; + + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->fetch_scanline_float ( + (pixman_image_t *)image->common.alpha_map, + x, y, width, (uint32_t *)alpha, mask); + + for (i = 0; i < width; ++i) + buffer[i].a = alpha[i].a; + + free (alpha); + } + } + + return iter->buffer; +} + +static void +dest_write_back_narrow (pixman_iter_t *iter) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + const uint32_t *buffer = iter->buffer; + + image->store_scanline_32 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_32 ( + image->common.alpha_map, x, y, width, buffer); + } + + iter->y++; +} + +static void +dest_write_back_wide (pixman_iter_t *iter) +{ + bits_image_t * image = &iter->image->bits; + int x = iter->x; + int y = iter->y; + int width = iter->width; + const uint32_t *buffer = iter->buffer; + + image->store_scanline_float (image, x, y, 
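The wide paths above operate on argb_t pixels, four floats with alpha first, which is why the buffer bookkeeping advances by four uint32_t-sized units per pixel. A minimal sketch of the per-pixel expansion the float pipeline performs for PIXMAN_a8r8g8b8 (the helper name expand_pixel is hypothetical; the patch itself uses pixman's pixman_expand_to_float on whole scanlines):

```c
#include <stdint.h>

typedef struct { float a, r, g, b; } argb_t;

/* Hypothetical scalar version of the 8-bit-to-float expansion:
 * each 8-bit channel maps to channel / 255.0f. */
static argb_t
expand_pixel (uint32_t p)
{
    argb_t f;

    f.a = ((p >> 24) & 0xff) / 255.0f;
    f.r = ((p >> 16) & 0xff) / 255.0f;
    f.g = ((p >>  8) & 0xff) / 255.0f;
    f.b = ((p >>  0) & 0xff) / 255.0f;

    return f;
}
```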
width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + image->common.alpha_map->store_scanline_float ( + image->common.alpha_map, x, y, width, buffer); + } + + iter->y++; +} + +void +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->iter_flags & ITER_NARROW) + { + if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == + (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) + { + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else + { + iter->get_scanline = dest_get_scanline_narrow; + } + + iter->write_back = dest_write_back_narrow; + } + else + { + iter->get_scanline = dest_get_scanline_wide; + iter->write_back = dest_write_back_wide; } } @@ -1240,10 +1667,11 @@ static uint32_t * create_bits (pixman_format_code_t format, int width, int height, - int * rowstride_bytes) + int * rowstride_bytes, + pixman_bool_t clear) { int stride; - int buf_size; + size_t buf_size; int bpp; /* what follows is a long-winded way, avoiding any possibility of integer @@ -1252,11 +1680,11 @@ create_bits (pixman_format_code_t format, */ bpp = PIXMAN_FORMAT_BPP (format); - if (pixman_multiply_overflows_int (width, bpp)) + if (_pixman_multiply_overflows_int (width, bpp)) return NULL; stride = width * bpp; - if (pixman_addition_overflows_int (stride, 0x1f)) + if (_pixman_addition_overflows_int (stride, 0x1f)) return NULL; stride += 0x1f; @@ -1264,7 +1692,7 @@ create_bits (pixman_format_code_t format, stride *= sizeof (uint32_t); - if (pixman_multiply_overflows_int (height, stride)) + if (_pixman_multiply_overflows_size (height, stride)) return NULL; buf_size = height * stride; @@ -1272,42 +1700,36 @@ create_bits (pixman_format_code_t format, if (rowstride_bytes) *rowstride_bytes = stride; - return calloc (buf_size, 1); + if (clear) + return calloc (buf_size, 1); + else + return malloc (buf_size); } -PIXMAN_EXPORT pixman_image_t * -pixman_image_create_bits (pixman_format_code_t format, - int width, - int height, - uint32_t * bits, - int rowstride_bytes) +pixman_bool_t +_pixman_bits_image_init (pixman_image_t * image, + pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride, + pixman_bool_t clear) { - pixman_image_t *image; uint32_t *free_me = NULL; - /* must be a whole number of uint32_t's - */ - return_val_if_fail ( - bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); - - return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); - if (!bits && width && height) { - free_me = bits = create_bits (format, width, height, &rowstride_bytes); + int rowstride_bytes; + + free_me = bits = create_bits (format, width, height, &rowstride_bytes, clear); + if (!bits) - return NULL; + return FALSE; + + rowstride = rowstride_bytes / (int) sizeof (uint32_t); } - image = _pixman_image_allocate (); - - if (!image) - { - if (free_me) - free (free_me); - - return NULL; - } + _pixman_image_init (image); image->type = BITS; image->bits.format = format; @@ -1317,15 +1739,70 @@ pixman_image_create_bits (pixman_format_code_t format, image->bits.free_me = free_me; image->bits.read_func = NULL; image->bits.write_func = NULL; - - /* The rowstride is stored in number of uint32_t */ - image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); - + image->bits.rowstride = rowstride; image->bits.indexed = NULL; image->common.property_changed = bits_image_property_changed; _pixman_image_reset_clip_region (image); + return TRUE; +} + +static 
pixman_image_t * +create_bits_image_internal (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes, + pixman_bool_t clear) +{ + pixman_image_t *image; + + /* must be a whole number of uint32_t's + */ + return_val_if_fail ( + bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); + + return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); + + image = _pixman_image_allocate (); + + if (!image) + return NULL; + + if (!_pixman_bits_image_init (image, format, width, height, bits, + rowstride_bytes / (int) sizeof (uint32_t), + clear)) + { + free (image); + return NULL; + } + return image; } + +/* If bits is NULL, a buffer will be allocated and initialized to 0 */ +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes) +{ + return create_bits_image_internal ( + format, width, height, bits, rowstride_bytes, TRUE); +} + + +/* If bits is NULL, a buffer will be allocated and _not_ initialized */ +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits_no_clear (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes) +{ + return create_bits_image_internal ( + format, width, height, bits, rowstride_bytes, FALSE); +} diff --git a/programs/develop/libraries/pixman/pixman-combine-float.c b/programs/develop/libraries/pixman/pixman-combine-float.c new file mode 100644 index 0000000000..5ea739f766 --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-combine-float.c @@ -0,0 +1,1016 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2010, 2012 Soren Sandmann Pedersen + * Copyright © 2010, 2012 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Author: Soren Sandmann Pedersen (sandmann@cs.au.dk) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> +#include <string.h> +#include <float.h> + +#include "pixman-private.h" + +/* Workaround for http://gcc.gnu.org/PR54965 */ +/* GCC 4.6 has problems with force_inline, so just use normal inline instead */ +#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 6) +#undef force_inline +#define force_inline __inline__ +#endif + +typedef float (* combine_channel_t) (float sa, float s, float da, float d); + +static force_inline void +combine_inner (pixman_bool_t component, + float *dest, const float *src, const float *mask, int n_pixels, + combine_channel_t combine_a, combine_channel_t combine_c) +{ + int i; + + if (!mask) + { + for (i = 0; i < 4 * n_pixels; i += 4) + { + float sa = src[i + 0]; + float sr = src[i + 1]; + float sg = src[i + 2]; + float sb = src[i + 3]; + + float da = dest[i + 0]; + float dr = dest[i + 1]; + float dg = dest[i + 2]; + float db = dest[i + 3]; + + dest[i + 0] = combine_a (sa, sa, da, da); + dest[i + 1] = combine_c (sa, sr, da, dr); + dest[i + 2] = combine_c (sa, sg, da, dg); + dest[i + 3] = combine_c (sa, sb, da, db); + } + } + else + { + for (i = 0; i < 4 * n_pixels; i += 4) + { + float sa, sr, sg, sb; + float ma, mr, mg, mb; + float da, dr, dg, db; + + sa = src[i + 0]; + sr = src[i + 1]; + sg = src[i + 2]; + sb = src[i + 3]; + + if (component) + { + ma = mask[i + 0]; + mr = mask[i + 1]; + mg = mask[i + 2]; + mb = mask[i + 3]; + + sr *= mr; + sg *= mg; + sb *= mb; + + ma *= sa; + mr *= sa; + mg *= sa; + mb *= sa; + + sa = ma; + } + else + { + ma = mask[i + 0]; + + sa *= ma; + sr *= ma; + sg *= ma; + sb *= ma; + + ma = mr = mg = mb = sa; + } + + da = dest[i + 0]; + dr = dest[i + 1]; + dg = dest[i + 2]; + db = dest[i + 3]; + + dest[i + 0] = combine_a (ma, sa, da, da); + dest[i + 1] = combine_c (mr, sr, da, dr); + dest[i + 2] = combine_c (mg, sg, da, dg); + dest[i + 3] = combine_c (mb, sb, da, db); + } + } +} + +#define MAKE_COMBINER(name, component, combine_a, combine_c) \ + static void \ + combine_ ## name ## _float (pixman_implementation_t *imp, \ + pixman_op_t op, \ + float *dest, \ + const float *src, \ + const float *mask, \ + int n_pixels) \ + { \ + combine_inner (component, dest, src, mask, n_pixels, \ + combine_a, combine_c); \ + } + +#define MAKE_COMBINERS(name, combine_a, combine_c) \ + MAKE_COMBINER(name ## _ca, TRUE, combine_a, combine_c) \ + MAKE_COMBINER(name ## _u, FALSE, combine_a, combine_c) + + +/* + * Porter/Duff operators + */ +typedef enum +{ + ZERO, + ONE, + SRC_ALPHA, + DEST_ALPHA, + INV_SA, + INV_DA, + SA_OVER_DA, + DA_OVER_SA, + INV_SA_OVER_DA, + INV_DA_OVER_SA, + ONE_MINUS_SA_OVER_DA, + ONE_MINUS_DA_OVER_SA, + ONE_MINUS_INV_DA_OVER_SA, + ONE_MINUS_INV_SA_OVER_DA +} combine_factor_t; + +#define CLAMP(f) \ + (((f) < 0)? 0 : (((f) > 1.0) ?
1.0 : (f))) + +static force_inline float +get_factor (combine_factor_t factor, float sa, float da) +{ + float f = -1; + + switch (factor) + { + case ZERO: + f = 0.0f; + break; + + case ONE: + f = 1.0f; + break; + + case SRC_ALPHA: + f = sa; + break; + + case DEST_ALPHA: + f = da; + break; + + case INV_SA: + f = 1 - sa; + break; + + case INV_DA: + f = 1 - da; + break; + + case SA_OVER_DA: + if (FLOAT_IS_ZERO (da)) + f = 1.0f; + else + f = CLAMP (sa / da); + break; + + case DA_OVER_SA: + if (FLOAT_IS_ZERO (sa)) + f = 1.0f; + else + f = CLAMP (da / sa); + break; + + case INV_SA_OVER_DA: + if (FLOAT_IS_ZERO (da)) + f = 1.0f; + else + f = CLAMP ((1.0f - sa) / da); + break; + + case INV_DA_OVER_SA: + if (FLOAT_IS_ZERO (sa)) + f = 1.0f; + else + f = CLAMP ((1.0f - da) / sa); + break; + + case ONE_MINUS_SA_OVER_DA: + if (FLOAT_IS_ZERO (da)) + f = 0.0f; + else + f = CLAMP (1.0f - sa / da); + break; + + case ONE_MINUS_DA_OVER_SA: + if (FLOAT_IS_ZERO (sa)) + f = 0.0f; + else + f = CLAMP (1.0f - da / sa); + break; + + case ONE_MINUS_INV_DA_OVER_SA: + if (FLOAT_IS_ZERO (sa)) + f = 0.0f; + else + f = CLAMP (1.0f - (1.0f - da) / sa); + break; + + case ONE_MINUS_INV_SA_OVER_DA: + if (FLOAT_IS_ZERO (da)) + f = 0.0f; + else + f = CLAMP (1.0f - (1.0f - sa) / da); + break; + } + + return f; +} + +#define MAKE_PD_COMBINERS(name, a, b) \ + static float force_inline \ + pd_combine_ ## name (float sa, float s, float da, float d) \ + { \ + const float fa = get_factor (a, sa, da); \ + const float fb = get_factor (b, sa, da); \ + \ + return MIN (1.0f, s * fa + d * fb); \ + } \ + \ + MAKE_COMBINERS(name, pd_combine_ ## name, pd_combine_ ## name) + +MAKE_PD_COMBINERS (clear, ZERO, ZERO) +MAKE_PD_COMBINERS (src, ONE, ZERO) +MAKE_PD_COMBINERS (dst, ZERO, ONE) +MAKE_PD_COMBINERS (over, ONE, INV_SA) +MAKE_PD_COMBINERS (over_reverse, INV_DA, ONE) +MAKE_PD_COMBINERS (in, DEST_ALPHA, ZERO) +MAKE_PD_COMBINERS (in_reverse, ZERO, SRC_ALPHA) +MAKE_PD_COMBINERS (out, INV_DA, ZERO) +MAKE_PD_COMBINERS (out_reverse, ZERO, INV_SA) +MAKE_PD_COMBINERS (atop, DEST_ALPHA, INV_SA) +MAKE_PD_COMBINERS (atop_reverse, INV_DA, SRC_ALPHA) +MAKE_PD_COMBINERS (xor, INV_DA, INV_SA) +MAKE_PD_COMBINERS (add, ONE, ONE) + +MAKE_PD_COMBINERS (saturate, INV_DA_OVER_SA, ONE) + +MAKE_PD_COMBINERS (disjoint_clear, ZERO, ZERO) +MAKE_PD_COMBINERS (disjoint_src, ONE, ZERO) +MAKE_PD_COMBINERS (disjoint_dst, ZERO, ONE) +MAKE_PD_COMBINERS (disjoint_over, ONE, INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_over_reverse, INV_DA_OVER_SA, ONE) +MAKE_PD_COMBINERS (disjoint_in, ONE_MINUS_INV_DA_OVER_SA, ZERO) +MAKE_PD_COMBINERS (disjoint_in_reverse, ZERO, ONE_MINUS_INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_out, INV_DA_OVER_SA, ZERO) +MAKE_PD_COMBINERS (disjoint_out_reverse, ZERO, INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_atop, ONE_MINUS_INV_DA_OVER_SA, INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_atop_reverse, INV_DA_OVER_SA, ONE_MINUS_INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_xor, INV_DA_OVER_SA, INV_SA_OVER_DA) + +MAKE_PD_COMBINERS (conjoint_clear, ZERO, ZERO) +MAKE_PD_COMBINERS (conjoint_src, ONE, ZERO) +MAKE_PD_COMBINERS (conjoint_dst, ZERO, ONE) +MAKE_PD_COMBINERS (conjoint_over, ONE, ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_over_reverse, ONE_MINUS_DA_OVER_SA, ONE) +MAKE_PD_COMBINERS (conjoint_in, DA_OVER_SA, ZERO) +MAKE_PD_COMBINERS (conjoint_in_reverse, ZERO, SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_out, ONE_MINUS_DA_OVER_SA, ZERO) +MAKE_PD_COMBINERS (conjoint_out_reverse, ZERO, ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_atop, 
DA_OVER_SA, ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_atop_reverse, ONE_MINUS_DA_OVER_SA, SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA) + +/* + * PDF blend modes: + * + * The following blend modes have been taken from the PDF ISO 32000 + * specification, which at this point in time is available from + * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf + * The relevant chapters are 11.3.5 and 11.3.6. + * The formula for computing the final pixel color given in 11.3.6 is: + * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs) + * with B() being the blend function. + * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs + * + * These blend modes should match the SVG filter draft specification, as + * it has been designed to mirror ISO 32000. Note that at the current point + * no released draft exists that shows this, as the formulas have not been + * updated yet after the release of ISO 32000. + * + * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and + * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an + * argument. Note that this implementation operates on premultiplied colors, + * while the PDF specification does not. Therefore the code uses the formula + * ar.Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as) + */ + +#define MAKE_SEPARABLE_PDF_COMBINERS(name) \ + static force_inline float \ + combine_ ## name ## _a (float sa, float s, float da, float d) \ + { \ + return da + sa - da * sa; \ + } \ + \ + static force_inline float \ + combine_ ## name ## _c (float sa, float s, float da, float d) \ + { \ + float f = (1 - sa) * d + (1 - da) * s; \ + \ + return f + blend_ ## name (sa, s, da, d); \ + } \ + \ + MAKE_COMBINERS (name, combine_ ## name ## _a, combine_ ## name ## _c) + +static force_inline float +blend_multiply (float sa, float s, float da, float d) +{ + return d * s; +} + +static force_inline float +blend_screen (float sa, float s, float da, float d) +{ + return d * sa + s * da - s * d; +} + +static force_inline float +blend_overlay (float sa, float s, float da, float d) +{ + if (2 * d < da) + return 2 * s * d; + else + return sa * da - 2 * (da - d) * (sa - s); +} + +static force_inline float +blend_darken (float sa, float s, float da, float d) +{ + s = s * da; + d = d * sa; + + if (s > d) + return d; + else + return s; +} + +static force_inline float +blend_lighten (float sa, float s, float da, float d) +{ + s = s * da; + d = d * sa; + + if (s > d) + return s; + else + return d; +} + +static force_inline float +blend_color_dodge (float sa, float s, float da, float d) +{ + if (FLOAT_IS_ZERO (d)) + return 0.0f; + else if (d * sa >= sa * da - s * da) + return sa * da; + else if (FLOAT_IS_ZERO (sa - s)) + return sa * da; + else + return sa * sa * d / (sa - s); +} + +static force_inline float +blend_color_burn (float sa, float s, float da, float d) +{ + if (d >= da) + return sa * da; + else if (sa * (da - d) >= s * da) + return 0.0f; + else if (FLOAT_IS_ZERO (s)) + return 0.0f; + else + return sa * (da - sa * (da - d) / s); +} + +static force_inline float +blend_hard_light (float sa, float s, float da, float d) +{ + if (2 * s < sa) + return 2 * s * d; + else + return sa * da - 2 * (da - d) * (sa - s); +} + +static force_inline float +blend_soft_light (float sa, float s, float da, float d) +{ + if (2 * s < sa) + { + if (FLOAT_IS_ZERO (da)) + return d * sa; + else + return d * sa - d * (da - d) * (sa - 2 * s) / da; + } + else + { + if 
(FLOAT_IS_ZERO (da)) + { + return 0.0f; + } + else + { + if (4 * d <= da) + return d * sa + (2 * s - sa) * d * ((16 * d / da - 12) * d / da + 3); + else + return d * sa + (sqrtf (d * da) - d) * (2 * s - sa); + } + } +} + +static force_inline float +blend_difference (float sa, float s, float da, float d) +{ + float dsa = d * sa; + float sda = s * da; + + if (sda < dsa) + return dsa - sda; + else + return sda - dsa; +} + +static force_inline float +blend_exclusion (float sa, float s, float da, float d) +{ + return s * da + d * sa - 2 * d * s; +} + +MAKE_SEPARABLE_PDF_COMBINERS (multiply) +MAKE_SEPARABLE_PDF_COMBINERS (screen) +MAKE_SEPARABLE_PDF_COMBINERS (overlay) +MAKE_SEPARABLE_PDF_COMBINERS (darken) +MAKE_SEPARABLE_PDF_COMBINERS (lighten) +MAKE_SEPARABLE_PDF_COMBINERS (color_dodge) +MAKE_SEPARABLE_PDF_COMBINERS (color_burn) +MAKE_SEPARABLE_PDF_COMBINERS (hard_light) +MAKE_SEPARABLE_PDF_COMBINERS (soft_light) +MAKE_SEPARABLE_PDF_COMBINERS (difference) +MAKE_SEPARABLE_PDF_COMBINERS (exclusion) + +/* + * PDF nonseparable blend modes. + * + * These are implemented using the following functions to operate in HSL + * space, with Cmax, Cmid, Cmin referring to the max, mid and min value + * of the red, green and blue components. + * + * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue + * + * clip_color (C): + * l = LUM (C) + * n = Cmin + * x = Cmax + * if n < 0.0 + * C = l + (((C – l) × l) ⁄ (l – n)) + * if x > 1.0 + * C = l + (((C – l) × (1 – l)) ⁄ (x – l)) + * return C + * + * set_lum (C, l): + * d = l – LUM (C) + * C += d + * return clip_color (C) + * + * SAT (C) = CH_MAX (C) - CH_MIN (C) + * + * set_sat (C, s): + * if Cmax > Cmin + * Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) ) + * Cmax = s + * else + * Cmid = Cmax = 0.0 + * Cmin = 0.0 + * return C + */ + +/* For premultiplied colors, we need to know what happens when C is + * multiplied by a real number. LUM and SAT are linear: + * + * LUM (r × C) = r × LUM (C) SAT (r × C) = r × SAT (C) + * + * If we extend clip_color with an extra argument a and change + * + * if x >= 1.0 + * + * into + * + * if x >= a + * + * then clip_color is also linear: + * + * r * clip_color (C, a) = clip_color (r * C, r * a) + * + * for positive r. + * + * Similarly, we can extend set_lum with an extra argument that is just passed + * on to clip_color: + * + * r × set_lum ( C, l, a) + * + * = r × clip_color ( C + l - LUM (C), a) + * + * = clip_color ( r * C + r × l - LUM (r × C), r * a) + * + * = set_lum ( r * C, r * l, r * a) + * + * Finally, set_sat: + * + * r * set_sat (C, s) = set_sat (x * C, r * s) + * + * The above holds for all non-zero x because the x'es in the fraction for + * C_mid cancel out. Specifically, it holds for x = r: + * + * r * set_sat (C, s) = set_sat (r * C, r * s) + * + * So, for the non-separable PDF blend modes, we have (using s, d for + * non-premultiplied colors, and S, D for premultiplied): + * + * Color: + * + * a_s * a_d * B(s, d) + * = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1) + * = set_lum (S * a_d, a_s * LUM (D), a_s * a_d) + * + * Luminosity: + * + * a_s * a_d * B(s, d) + * = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1) + * = set_lum (a_s * D, a_d * LUM(S), a_s * a_d) + * + * Saturation: + * + * a_s * a_d * B(s, d) + * = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1) + * = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)), + * a_s * LUM (D), a_s * a_d) + * = set_lum (set_sat (a_s * D, a_d * SAT (S)), a_s * LUM (D), a_s * a_d) + * + * Hue: + * + * a_s * a_d * B(s, d) + * = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1) + * = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d) + */
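As a worked check of the Color case above, apply the linearity rule for set_lum with r = a_s · a_d (editorial note; notation as in the comment):

```latex
\begin{align*}
a_s a_d \, B(s, d)
  &= a_s a_d \,\operatorname{set\_lum}\bigl(S/a_s,\; \operatorname{LUM}(D)/a_d,\; 1\bigr) \\
  &= \operatorname{set\_lum}\bigl(a_s a_d \cdot S/a_s,\; a_s a_d \cdot \operatorname{LUM}(D)/a_d,\; a_s a_d \cdot 1\bigr) \\
  &= \operatorname{set\_lum}\bigl(a_d\, S,\; a_s \operatorname{LUM}(D),\; a_s a_d\bigr)
\end{align*}
```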
+ +typedef struct +{ + float r; + float g; + float b; +} rgb_t; + +static force_inline float +minf (float a, float b) +{ + return a < b? a : b; +} + +static force_inline float +maxf (float a, float b) +{ + return a > b? a : b; +} + +static force_inline float +channel_min (const rgb_t *c) +{ + return minf (minf (c->r, c->g), c->b); +} + +static force_inline float +channel_max (const rgb_t *c) +{ + return maxf (maxf (c->r, c->g), c->b); +} + +static force_inline float +get_lum (const rgb_t *c) +{ + return c->r * 0.3f + c->g * 0.59f + c->b * 0.11f; +} + +static force_inline float +get_sat (const rgb_t *c) +{ + return channel_max (c) - channel_min (c); +} + +static void +clip_color (rgb_t *color, float a) +{ + float l = get_lum (color); + float n = channel_min (color); + float x = channel_max (color); + float t; + + if (n < 0.0f) + { + t = l - n; + if (FLOAT_IS_ZERO (t)) + { + color->r = 0.0f; + color->g = 0.0f; + color->b = 0.0f; + } + else + { + color->r = l + (((color->r - l) * l) / t); + color->g = l + (((color->g - l) * l) / t); + color->b = l + (((color->b - l) * l) / t); + } + } + if (x > a) + { + t = x - l; + if (FLOAT_IS_ZERO (t)) + { + color->r = a; + color->g = a; + color->b = a; + } + else + { + color->r = l + (((color->r - l) * (a - l) / t)); + color->g = l + (((color->g - l) * (a - l) / t)); + color->b = l + (((color->b - l) * (a - l) / t)); + } + } +} + +static void +set_lum (rgb_t *color, float sa, float l) +{ + float d = l - get_lum (color); + + color->r = color->r + d; + color->g = color->g + d; + color->b = color->b + d; + + clip_color (color, sa); +} + +static void +set_sat (rgb_t *src, float sat) +{ + float *max, *mid, *min; + float t; + + if (src->r > src->g) + { + if (src->r > src->b) + { + max = &(src->r); + + if (src->g > src->b) + { + mid = &(src->g); + min = &(src->b); + } + else + { + mid = &(src->b); + min = &(src->g); + } + } + else + { + max = &(src->b); + mid = &(src->r); + min = &(src->g); + } + } + else + { + if (src->r > src->b) + { + max = &(src->g); + mid = &(src->r); + min = &(src->b); + } + else + { + min = &(src->r); + + if (src->g > src->b) + { + max = &(src->g); + mid = &(src->b); + } + else + { + max = &(src->b); + mid = &(src->g); + } + } + } + + t = *max - *min; + + if (FLOAT_IS_ZERO (t)) + { + *mid = *max = 0.0f; + } + else + { + *mid = ((*mid - *min) * sat) / t; + *max = sat; + } + + *min = 0.0f; +}
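To see set_sat () in action on concrete values (editorial sketch; the standalone main () below is not part of the patch):

```c
#include <stdio.h>

/* Walk through set_sat () above for C = (0.9, 0.5, 0.1) and sat = 0.4:
 * max = r, mid = g, min = b, so t = 0.9 - 0.1 = 0.8. */
int
main (void)
{
    float r = 0.9f, g = 0.5f, b = 0.1f;
    float sat = 0.4f;
    float t = r - b;

    g = (g - b) * sat / t;   /* mid: (0.5 - 0.1) * 0.4 / 0.8 = 0.2 */
    r = sat;                 /* max becomes the requested saturation */
    b = 0.0f;                /* min is forced to zero */

    /* Prints 0.40 0.20 0.00; the result's SAT is exactly 0.4. */
    printf ("%.2f %.2f %.2f\n", r, g, b);
    return 0;
}
```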
+ +/* + * Hue: + * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb)) + */ +static force_inline void +blend_hsl_hue (rgb_t *res, + const rgb_t *dest, float da, + const rgb_t *src, float sa) +{ + res->r = src->r * da; + res->g = src->g * da; + res->b = src->b * da; + + set_sat (res, get_sat (dest) * sa); + set_lum (res, sa * da, get_lum (dest) * sa); +} + +/* + * Saturation: + * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb)) + */ +static force_inline void +blend_hsl_saturation (rgb_t *res, + const rgb_t *dest, float da, + const rgb_t *src, float sa) +{ + res->r = dest->r * sa; + res->g = dest->g * sa; + res->b = dest->b * sa; + + set_sat (res, get_sat (src) * da); + set_lum (res, sa * da, get_lum (dest) * sa); +} + +/* + * Color: + * B(Cb, Cs) = set_lum (Cs, LUM (Cb)) + */ +static force_inline void +blend_hsl_color (rgb_t *res, + const rgb_t *dest, float da, + const rgb_t *src, float sa) +{ + res->r = src->r * da; + res->g = src->g * da; + res->b = src->b * da; + + set_lum (res, sa * da, get_lum (dest) * sa); +} + +/* + * Luminosity: + * B(Cb, Cs) = set_lum (Cb, LUM (Cs)) + */ +static force_inline void +blend_hsl_luminosity (rgb_t *res, + const rgb_t *dest, float da, + const rgb_t *src, float sa) +{ + res->r = dest->r * sa; + res->g = dest->g * sa; + res->b = dest->b * sa; + + set_lum (res, sa * da, get_lum (src) * da); +} + +#define MAKE_NON_SEPARABLE_PDF_COMBINERS(name) \ + static void \ + combine_ ## name ## _u_float (pixman_implementation_t *imp, \ + pixman_op_t op, \ + float *dest, \ + const float *src, \ + const float *mask, \ + int n_pixels) \ + { \ + int i; \ + \ + for (i = 0; i < 4 * n_pixels; i += 4) \ + { \ + float sa, da; \ + rgb_t sc, dc, rc; \ + \ + sa = src[i + 0]; \ + sc.r = src[i + 1]; \ + sc.g = src[i + 2]; \ + sc.b = src[i + 3]; \ + \ + da = dest[i + 0]; \ + dc.r = dest[i + 1]; \ + dc.g = dest[i + 2]; \ + dc.b = dest[i + 3]; \ + \ + if (mask) \ + { \ + float ma = mask[i + 0]; \ + \ + /* Component alpha is not supported for HSL modes */ \ + sa *= ma; \ + sc.r *= ma; \ + sc.g *= ma; \ + sc.b *= ma; \ + } \ + \ + blend_ ## name (&rc, &dc, da, &sc, sa); \ + \ + dest[i + 0] = sa + da - sa * da; \ + dest[i + 1] = (1 - sa) * dc.r + (1 - da) * sc.r + rc.r; \ + dest[i + 2] = (1 - sa) * dc.g + (1 - da) * sc.g + rc.g; \ + dest[i + 3] = (1 - sa) * dc.b + (1 - da) * sc.b + rc.b; \ + } \ + } + +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_hue) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_saturation) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_color) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_luminosity) + +void +_pixman_setup_combiner_functions_float (pixman_implementation_t *imp) +{ + /* Unified alpha */ + imp->combine_float[PIXMAN_OP_CLEAR] = combine_clear_u_float; + imp->combine_float[PIXMAN_OP_SRC] = combine_src_u_float; + imp->combine_float[PIXMAN_OP_DST] = combine_dst_u_float; + imp->combine_float[PIXMAN_OP_OVER] = combine_over_u_float; + imp->combine_float[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u_float; + imp->combine_float[PIXMAN_OP_IN] = combine_in_u_float; + imp->combine_float[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u_float; + imp->combine_float[PIXMAN_OP_OUT] = combine_out_u_float; + imp->combine_float[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u_float; + imp->combine_float[PIXMAN_OP_ATOP] = combine_atop_u_float; + imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u_float; + imp->combine_float[PIXMAN_OP_XOR] = combine_xor_u_float; + imp->combine_float[PIXMAN_OP_ADD] = combine_add_u_float; + imp->combine_float[PIXMAN_OP_SATURATE] = combine_saturate_u_float; + + /* Disjoint, unified */ + imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_SRC] =
combine_disjoint_src_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u_float; + imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u_float; + + /* Conjoint, unified */ + imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u_float; + imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u_float; + + /* PDF operators, unified */ + imp->combine_float[PIXMAN_OP_MULTIPLY] = combine_multiply_u_float; + imp->combine_float[PIXMAN_OP_SCREEN] = combine_screen_u_float; + imp->combine_float[PIXMAN_OP_OVERLAY] = combine_overlay_u_float; + imp->combine_float[PIXMAN_OP_DARKEN] = combine_darken_u_float; + imp->combine_float[PIXMAN_OP_LIGHTEN] = combine_lighten_u_float; + imp->combine_float[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u_float; + imp->combine_float[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u_float; + imp->combine_float[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u_float; + imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u_float; + imp->combine_float[PIXMAN_OP_DIFFERENCE] = combine_difference_u_float; + imp->combine_float[PIXMAN_OP_EXCLUSION] = combine_exclusion_u_float; + + imp->combine_float[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u_float; + imp->combine_float[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u_float; + imp->combine_float[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u_float; + imp->combine_float[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u_float; + + /* Component alpha combiners */ + imp->combine_float_ca[PIXMAN_OP_CLEAR] = combine_clear_ca_float; + imp->combine_float_ca[PIXMAN_OP_SRC] = combine_src_ca_float; + imp->combine_float_ca[PIXMAN_OP_DST] = combine_dst_ca_float; + imp->combine_float_ca[PIXMAN_OP_OVER] = combine_over_ca_float; + imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_IN] = combine_in_ca_float; + imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_OUT] = 
combine_out_ca_float; + imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_ATOP] = combine_atop_ca_float; + imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_XOR] = combine_xor_ca_float; + imp->combine_float_ca[PIXMAN_OP_ADD] = combine_add_ca_float; + imp->combine_float_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca_float; + + /* Disjoint CA */ + imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = combine_disjoint_src_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca_float; + + /* Conjoint CA */ + imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca_float; + + /* PDF operators CA */ + imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca_float; + imp->combine_float_ca[PIXMAN_OP_SCREEN] = combine_screen_ca_float; + imp->combine_float_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca_float; + imp->combine_float_ca[PIXMAN_OP_DARKEN] = combine_darken_ca_float; + imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca_float; + imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca_float; + imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca_float; + imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca_float; + imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca_float; + imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca_float; + imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca_float; + + /* It is not clear that these make sense, so make them noops for now */ + imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = 
combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst_u_float;
+}
diff --git a/programs/develop/libraries/pixman/pixman-combine32.c b/programs/develop/libraries/pixman/pixman-combine32.c
index d98d3f8383..3ac7576bdc 100644
--- a/programs/develop/libraries/pixman/pixman-combine32.c
+++ b/programs/develop/libraries/pixman/pixman-combine32.c
@@ -1,7 +1,26 @@
-/* WARNING: This file is generated by combine.pl from combine.inc.
- Please edit one of those files rather than this one. */
-
-#line 1 "pixman-combine.c.template"
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
@@ -10,10 +29,9 @@
#include <string.h>
#include "pixman-private.h"
-
#include "pixman-combine32.h"
-/*** per channel helper functions ***/
+/* component alpha helper functions */
static void
combine_mask_ca (uint32_t *src, uint32_t *mask)
@@ -95,15 +113,11 @@ combine_mask_alpha_ca (const uint32_t *src, uint32_t *mask)
/*
* There are two ways of handling alpha -- either as a single unified value or
* a separate value for each component, hence each macro must have two
- * versions. The unified alpha version has a 'U' at the end of the name,
- * the component version has a 'C'. Similarly, functions which deal with
+ * versions. The unified alpha version has a 'u' at the end of the name,
+ * the component version has a 'ca'. Similarly, functions which deal with
* this difference will have two versions using the same convention.
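* For example, combine_over_u below reads a single alpha value out of
* the mask for each pixel, while combine_over_ca applies the mask's
* red, green, blue and alpha channels to the source separately.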
*/ -/* - * All of the composing functions - */ - static force_inline uint32_t combine_mask (const uint32_t *src, const uint32_t *mask, int i) { @@ -158,7 +172,9 @@ combine_src_u (pixman_implementation_t *imp, int i; if (!mask) + { memcpy (dest, src, width * sizeof (uint32_t)); + } else { for (i = 0; i < width; ++i) @@ -170,7 +186,6 @@ combine_src_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, call combine_src_u */ static void combine_over_u (pixman_implementation_t *imp, pixman_op_t op, @@ -181,18 +196,61 @@ combine_over_u (pixman_implementation_t *imp, { int i; - for (i = 0; i < width; ++i) + if (!mask) { - uint32_t s = combine_mask (src, mask, i); - uint32_t d = *(dest + i); - uint32_t ia = ALPHA_8 (~s); - - UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); - *(dest + i) = d; + for (i = 0; i < width; ++i) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + } + else + { + for (i = 0; i < width; ++i) + { + uint32_t m = ALPHA_8 (*(mask + i)); + if (m == 0xFF) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + else if (m) + { + uint32_t s = *(src + i); + if (s) + { + uint32_t d = *(dest + i); + UN8x4_MUL_UN8 (s, m); + UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s); + *(dest + i) = d; + } + } + } } } -/* if the Dst is opaque, this is a noop */ static void combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op, @@ -213,7 +271,6 @@ combine_over_reverse_u (pixman_implementation_t *imp, } } -/* if the Dst is opaque, call combine_src_u */ static void combine_in_u (pixman_implementation_t *imp, pixman_op_t op, @@ -233,7 +290,6 @@ combine_in_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, this is a noop */ static void combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op, @@ -254,7 +310,6 @@ combine_in_reverse_u (pixman_implementation_t *imp, } } -/* if the Dst is opaque, call combine_clear */ static void combine_out_u (pixman_implementation_t *imp, pixman_op_t op, @@ -274,7 +329,6 @@ combine_out_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, call combine_clear */ static void combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op, @@ -295,9 +349,6 @@ combine_out_reverse_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, call combine_in_u */ -/* if the Dst is opaque, call combine_over_u */ -/* if both the Src and Dst are opaque, call combine_src_u */ static void combine_atop_u (pixman_implementation_t *imp, pixman_op_t op, @@ -320,9 +371,6 @@ combine_atop_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, call combine_over_reverse_u */ -/* if the Dst is opaque, call combine_in_reverse_u */ -/* if both the Src and Dst are opaque, call combine_dst_u */ static void combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op, @@ -345,9 +393,6 @@ combine_atop_reverse_u (pixman_implementation_t *imp, } } -/* if the Src is opaque, call combine_over_u */ -/* if the Dst is opaque, call combine_over_reverse_u */ -/* if both the Src and Dst are opaque, call combine_clear */ static void combine_xor_u (pixman_implementation_t *imp, pixman_op_t op, @@ -389,9 +434,6 @@ combine_add_u (pixman_implementation_t *imp, } } -/* if the Src is 
opaque, call combine_add_u */ -/* if the Dst is opaque, call combine_add_u */ -/* if both the Src and Dst are opaque, call combine_add_u */ static void combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op, @@ -441,14 +483,13 @@ combine_saturate_u (pixman_implementation_t *imp, * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an * argument. Note that this implementation operates on premultiplied colors, * while the PDF specification does not. Therefore the code uses the formula - * ar.Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as) + * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as) */ /* * Multiply * B(Dca, ad, Sca, as) = Dca.Sca */ - static void combine_multiply_u (pixman_implementation_t *imp, pixman_op_t op, @@ -493,7 +534,7 @@ combine_multiply_ca (pixman_implementation_t *imp, uint32_t r = d; uint32_t dest_ia = ALPHA_8 (~d); - combine_mask_value_ca (&s, &m); + combine_mask_ca (&s, &m); UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (r, ~m, s, dest_ia); UN8x4_MUL_UN8x4 (d, s); @@ -526,7 +567,7 @@ combine_multiply_ca (pixman_implementation_t *imp, UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (result, isa, s, ida); \ \ *(dest + i) = result + \ - (DIV_ONE_UN8 (sa * da) << A_SHIFT) + \ + (DIV_ONE_UN8 (sa * (uint32_t)da) << A_SHIFT) + \ (blend_ ## name (RED_8 (d), da, RED_8 (s), sa) << R_SHIFT) + \ (blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), sa) << G_SHIFT) + \ (blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), sa)); \ @@ -550,13 +591,13 @@ combine_multiply_ca (pixman_implementation_t *imp, uint8_t ida = ~da; \ uint32_t result; \ \ - combine_mask_value_ca (&s, &m); \ + combine_mask_ca (&s, &m); \ \ result = d; \ UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (result, ~m, s, ida); \ \ result += \ - (DIV_ONE_UN8 (ALPHA_8 (m) * da) << A_SHIFT) + \ + (DIV_ONE_UN8 (ALPHA_8 (m) * (uint32_t)da) << A_SHIFT) + \ (blend_ ## name (RED_8 (d), da, RED_8 (s), RED_8 (m)) << R_SHIFT) + \ (blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), GREEN_8 (m)) << G_SHIFT) + \ (blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), BLUE_8 (m))); \ @@ -853,7 +894,7 @@ PDF_SEPARABLE_BLEND_MODE (exclusion) * * r * set_sat (C, s) = set_sat (x * C, r * s) * - * The above holds for all non-zero x, because they x'es in the fraction for + * The above holds for all non-zero x, because the x'es in the fraction for * C_mid cancel out. 
Specifically, it holds for x = r: * * r * set_sat (C, s) = set_sat (r_c, rs) @@ -889,8 +930,7 @@ PDF_SEPARABLE_BLEND_MODE (exclusion) * * a_s * a_d * B(s, d) * = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1) - * = a_s * a_d * set_lum (set_sat (a_d * S, a_s * SAT (D)), - * a_s * LUM (D), a_s * a_d) + * = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d) * */ @@ -931,7 +971,7 @@ PDF_SEPARABLE_BLEND_MODE (exclusion) blend_ ## name (c, dc, da, sc, sa); \ \ *(dest + i) = result + \ - (DIV_ONE_UN8 (sa * da) << A_SHIFT) + \ + (DIV_ONE_UN8 (sa * (uint32_t)da) << A_SHIFT) + \ (DIV_ONE_UN8 (c[0]) << R_SHIFT) + \ (DIV_ONE_UN8 (c[1]) << G_SHIFT) + \ (DIV_ONE_UN8 (c[2])); \ @@ -1148,9 +1188,7 @@ PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity) #undef CH_MIN #undef PDF_NON_SEPARABLE_BLEND_MODE -/* Overlay - * - * All of the disjoint composing functions +/* All of the disjoint/conjoint composing functions * * The four entries in the first column indicate what source contributions * come from each of the four areas of the picture -- areas covered by neither @@ -1171,6 +1209,9 @@ PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity) * (0,0,B,A) max(1-(1-b)/a,0) min(1,(1-a)/b) min(1,b/a) max(1-a/b,0) * (0,A,0,B) min(1,(1-b)/a) max(1-(1-a)/b,0) max(1-b/a,0) min(1,a/b) * (0,A,B,0) min(1,(1-b)/a) min(1,(1-a)/b) max(1-b/a,0) max(1-a/b,0) + * + * See http://marc.info/?l=xfree-render&m=99792000027857&w=2 for more + * information about these operators. */ #define COMBINE_A_OUT 1 @@ -1583,9 +1624,8 @@ combine_conjoint_xor_u (pixman_implementation_t *imp, combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR); } -/************************************************************************/ -/*********************** Per Channel functions **************************/ -/************************************************************************/ + +/* Component alpha combiners */ static void combine_clear_ca (pixman_implementation_t *imp, @@ -2462,4 +2502,3 @@ _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp) imp->combine_32_ca[PIXMAN_OP_HSL_COLOR] = combine_dst; imp->combine_32_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst; } - diff --git a/programs/develop/libraries/pixman/pixman-combine32.h b/programs/develop/libraries/pixman/pixman-combine32.h index 68bde4281a..cdd56a61a1 100644 --- a/programs/develop/libraries/pixman/pixman-combine32.h +++ b/programs/develop/libraries/pixman/pixman-combine32.h @@ -1,8 +1,3 @@ -/* WARNING: This file is generated by combine.pl from combine.inc. - Please edit one of those files rather than this one. */ - -#line 1 "pixman-combine.c.template" - #define COMPONENT_SIZE 8 #define MASK 0xff #define ONE_HALF 0x80 @@ -24,19 +19,62 @@ #define GREEN_8(x) (((x) >> G_SHIFT) & MASK) #define BLUE_8(x) ((x) & MASK) +/* + * ARMv6 has UQADD8 instruction, which implements unsigned saturated + * addition for 8-bit values packed in 32-bit registers. It is very useful + * for UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would + * otherwise need a lot of arithmetic operations to simulate this operation). + * Since most of the major ARM linux distros are built for ARMv7, we are + * much less dependent on runtime CPU detection and can get practical + * benefits from conditional compilation here for a lot of users. 
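+ *
+ * For example, given x = 0x80FF40C0 and y = 0x90013020, uqadd8 adds
+ * the four byte lanes independently and clamps each sum at 0xFF,
+ * producing 0xFFFF70E0; the 0xFF + 0x01 lane saturates to 0xFF
+ * instead of wrapping around to 0x00.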
+ */ + +#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \ + !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__)) +#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) + +static force_inline uint32_t +un8x4_add_un8x4 (uint32_t x, uint32_t y) +{ + uint32_t t; + asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y)); + return t; +} + +#define UN8x4_ADD_UN8x4(x, y) \ + ((x) = un8x4_add_un8x4 ((x), (y))) + +#define UN8_rb_ADD_UN8_rb(x, y, t) \ + ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t)) + +#define ADD_UN8(x, y, t) \ + ((t) = (x), un8x4_add_un8x4 ((t), (y))) + +#endif +#endif + +/*****************************************************************************/ + /* * Helper macros. */ #define MUL_UN8(a, b, t) \ - ((t) = (a) * (b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT )) + ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT )) #define DIV_UN8(a, b) \ - (((uint16_t) (a) * MASK) / (b)) + (((uint16_t) (a) * MASK + ((b) / 2)) / (b)) +#ifndef ADD_UN8 #define ADD_UN8(x, y, t) \ ((t) = (x) + (y), \ (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT)))) +#endif #define DIV_ONE_UN8(x) \ (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT) @@ -61,6 +99,7 @@ /* * x_rb = min (x_rb + y_rb, 255) */ +#ifndef UN8_rb_ADD_UN8_rb #define UN8_rb_ADD_UN8_rb(x, y, t) \ do \ { \ @@ -68,6 +107,7 @@ t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ x = (t & RB_MASK); \ } while (0) +#endif /* * x_rb = (x_rb * a_rb) / 255 @@ -213,6 +253,7 @@ /* x_c = min(x_c + y_c, 255) */ +#ifndef UN8x4_ADD_UN8x4 #define UN8x4_ADD_UN8x4(x, y) \ do \ { \ @@ -228,3 +269,4 @@ \ x = r1__ | (r2__ << G_SHIFT); \ } while (0) +#endif diff --git a/programs/develop/libraries/pixman/pixman-combine64.c b/programs/develop/libraries/pixman/pixman-combine64.c deleted file mode 100644 index 850afba6ad..0000000000 --- a/programs/develop/libraries/pixman/pixman-combine64.c +++ /dev/null @@ -1,2465 +0,0 @@ -/* WARNING: This file is generated by combine.pl from combine.inc. - Please edit one of those files rather than this one. 
*/ - -#line 1 "pixman-combine.c.template" -#ifdef HAVE_CONFIG_H -#include -#endif - -#include -#include - -#include "pixman-private.h" - -#include "pixman-combine64.h" - -/*** per channel helper functions ***/ - -static void -combine_mask_ca (uint64_t *src, uint64_t *mask) -{ - uint64_t a = *mask; - - uint64_t x; - uint32_t xa; - - if (!a) - { - *(src) = 0; - return; - } - - x = *(src); - if (a == ~0) - { - x = x >> A_SHIFT; - x |= x << G_SHIFT; - x |= x << R_SHIFT; - *(mask) = x; - return; - } - - xa = x >> A_SHIFT; - UN16x4_MUL_UN16x4 (x, a); - *(src) = x; - - UN16x4_MUL_UN16 (a, xa); - *(mask) = a; -} - -static void -combine_mask_value_ca (uint64_t *src, const uint64_t *mask) -{ - uint64_t a = *mask; - uint64_t x; - - if (!a) - { - *(src) = 0; - return; - } - - if (a == ~0) - return; - - x = *(src); - UN16x4_MUL_UN16x4 (x, a); - *(src) = x; -} - -static void -combine_mask_alpha_ca (const uint64_t *src, uint64_t *mask) -{ - uint64_t a = *(mask); - uint64_t x; - - if (!a) - return; - - x = *(src) >> A_SHIFT; - if (x == MASK) - return; - - if (a == ~0) - { - x |= x << G_SHIFT; - x |= x << R_SHIFT; - *(mask) = x; - return; - } - - UN16x4_MUL_UN16 (a, x); - *(mask) = a; -} - -/* - * There are two ways of handling alpha -- either as a single unified value or - * a separate value for each component, hence each macro must have two - * versions. The unified alpha version has a 'U' at the end of the name, - * the component version has a 'C'. Similarly, functions which deal with - * this difference will have two versions using the same convention. - */ - -/* - * All of the composing functions - */ - -static force_inline uint64_t -combine_mask (const uint64_t *src, const uint64_t *mask, int i) -{ - uint64_t s, m; - - if (mask) - { - m = *(mask + i) >> A_SHIFT; - - if (!m) - return 0; - } - - s = *(src + i); - - if (mask) - UN16x4_MUL_UN16 (s, m); - - return s; -} - -static void -combine_clear (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - memset (dest, 0, width * sizeof(uint64_t)); -} - -static void -combine_dst (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - return; -} - -static void -combine_src_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - if (!mask) - memcpy (dest, src, width * sizeof (uint64_t)); - else - { - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - - *(dest + i) = s; - } - } -} - -/* if the Src is opaque, call combine_src_u */ -static void -combine_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t ia = ALPHA_16 (~s); - - UN16x4_MUL_UN16_ADD_UN16x4 (d, ia, s); - *(dest + i) = d; - } -} - -/* if the Dst is opaque, this is a noop */ -static void -combine_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t ia = ALPHA_16 (~*(dest + i)); - UN16x4_MUL_UN16_ADD_UN16x4 (s, ia, d); - *(dest + i) = s; - } -} - -/* if the Dst is opaque, call combine_src_u */ 
-static void -combine_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t a = ALPHA_16 (*(dest + i)); - UN16x4_MUL_UN16 (s, a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, this is a noop */ -static void -combine_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t a = ALPHA_16 (s); - UN16x4_MUL_UN16 (d, a); - *(dest + i) = d; - } -} - -/* if the Dst is opaque, call combine_clear */ -static void -combine_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t a = ALPHA_16 (~*(dest + i)); - UN16x4_MUL_UN16 (s, a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call combine_clear */ -static void -combine_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t a = ALPHA_16 (~s); - UN16x4_MUL_UN16 (d, a); - *(dest + i) = d; - } -} - -/* if the Src is opaque, call combine_in_u */ -/* if the Dst is opaque, call combine_over_u */ -/* if both the Src and Dst are opaque, call combine_src_u */ -static void -combine_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t dest_a = ALPHA_16 (d); - uint64_t src_ia = ALPHA_16 (~s); - - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (s, dest_a, d, src_ia); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call combine_over_reverse_u */ -/* if the Dst is opaque, call combine_in_reverse_u */ -/* if both the Src and Dst are opaque, call combine_dst_u */ -static void -combine_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t src_a = ALPHA_16 (s); - uint64_t dest_ia = ALPHA_16 (~d); - - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (s, dest_ia, d, src_a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call combine_over_u */ -/* if the Dst is opaque, call combine_over_reverse_u */ -/* if both the Src and Dst are opaque, call combine_clear */ -static void -combine_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t src_ia = ALPHA_16 (~s); - uint64_t dest_ia = ALPHA_16 (~d); - - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (s, dest_ia, d, src_ia); - *(dest + i) = s; - } -} - -static void -combine_add_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ 
- int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - UN16x4_ADD_UN16x4 (d, s); - *(dest + i) = d; - } -} - -/* if the Src is opaque, call combine_add_u */ -/* if the Dst is opaque, call combine_add_u */ -/* if both the Src and Dst are opaque, call combine_add_u */ -static void -combine_saturate_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint32_t sa, da; - - sa = s >> A_SHIFT; - da = ~d >> A_SHIFT; - if (sa > da) - { - sa = DIV_UN16 (da, sa); - UN16x4_MUL_UN16 (s, sa); - } - ; - UN16x4_ADD_UN16x4 (d, s); - *(dest + i) = d; - } -} - -/* - * PDF blend modes: - * The following blend modes have been taken from the PDF ISO 32000 - * specification, which at this point in time is available from - * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf - * The relevant chapters are 11.3.5 and 11.3.6. - * The formula for computing the final pixel color given in 11.3.6 is: - * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs) - * with B() being the blend function. - * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs - * - * These blend modes should match the SVG filter draft specification, as - * it has been designed to mirror ISO 32000. Note that at the current point - * no released draft exists that shows this, as the formulas have not been - * updated yet after the release of ISO 32000. - * - * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and - * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an - * argument. Note that this implementation operates on premultiplied colors, - * while the PDF specification does not. Therefore the code uses the formula - * ar.Cra = (1 – as) . Dca + (1 – ad) . 
Sca + B(Dca, ad, Sca, as) - */ - -/* - * Multiply - * B(Dca, ad, Sca, as) = Dca.Sca - */ - -static void -combine_multiply_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t ss = s; - uint64_t src_ia = ALPHA_16 (~s); - uint64_t dest_ia = ALPHA_16 (~d); - - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (ss, dest_ia, d, src_ia); - UN16x4_MUL_UN16x4 (d, s); - UN16x4_ADD_UN16x4 (d, ss); - - *(dest + i) = d; - } -} - -static void -combine_multiply_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t m = *(mask + i); - uint64_t s = *(src + i); - uint64_t d = *(dest + i); - uint64_t r = d; - uint64_t dest_ia = ALPHA_16 (~d); - - combine_mask_value_ca (&s, &m); - - UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16 (r, ~m, s, dest_ia); - UN16x4_MUL_UN16x4 (d, s); - UN16x4_ADD_UN16x4 (r, d); - - *(dest + i) = r; - } -} - -#define PDF_SEPARABLE_BLEND_MODE(name) \ - static void \ - combine_ ## name ## _u (pixman_implementation_t *imp, \ - pixman_op_t op, \ - uint64_t * dest, \ - const uint64_t * src, \ - const uint64_t * mask, \ - int width) \ - { \ - int i; \ - for (i = 0; i < width; ++i) { \ - uint64_t s = combine_mask (src, mask, i); \ - uint64_t d = *(dest + i); \ - uint16_t sa = ALPHA_16 (s); \ - uint16_t isa = ~sa; \ - uint16_t da = ALPHA_16 (d); \ - uint16_t ida = ~da; \ - uint64_t result; \ - \ - result = d; \ - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (result, isa, s, ida); \ - \ - *(dest + i) = result + \ - (DIV_ONE_UN16 (sa * da) << A_SHIFT) + \ - (blend_ ## name (RED_16 (d), da, RED_16 (s), sa) << R_SHIFT) + \ - (blend_ ## name (GREEN_16 (d), da, GREEN_16 (s), sa) << G_SHIFT) + \ - (blend_ ## name (BLUE_16 (d), da, BLUE_16 (s), sa)); \ - } \ - } \ - \ - static void \ - combine_ ## name ## _ca (pixman_implementation_t *imp, \ - pixman_op_t op, \ - uint64_t * dest, \ - const uint64_t * src, \ - const uint64_t * mask, \ - int width) \ - { \ - int i; \ - for (i = 0; i < width; ++i) { \ - uint64_t m = *(mask + i); \ - uint64_t s = *(src + i); \ - uint64_t d = *(dest + i); \ - uint16_t da = ALPHA_16 (d); \ - uint16_t ida = ~da; \ - uint64_t result; \ - \ - combine_mask_value_ca (&s, &m); \ - \ - result = d; \ - UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16 (result, ~m, s, ida); \ - \ - result += \ - (DIV_ONE_UN16 (ALPHA_16 (m) * da) << A_SHIFT) + \ - (blend_ ## name (RED_16 (d), da, RED_16 (s), RED_16 (m)) << R_SHIFT) + \ - (blend_ ## name (GREEN_16 (d), da, GREEN_16 (s), GREEN_16 (m)) << G_SHIFT) + \ - (blend_ ## name (BLUE_16 (d), da, BLUE_16 (s), BLUE_16 (m))); \ - \ - *(dest + i) = result; \ - } \ - } - -/* - * Screen - * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca - */ -static inline uint64_t -blend_screen (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - return DIV_ONE_UN16 (sca * da + dca * sa - sca * dca); -} - -PDF_SEPARABLE_BLEND_MODE (screen) - -/* - * Overlay - * B(Dca, Da, Sca, Sa) = - * if 2.Dca < Da - * 2.Sca.Dca - * otherwise - * Sa.Da - 2.(Da - Dca).(Sa - Sca) - */ -static inline uint64_t -blend_overlay (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - uint64_t rca; - - if (2 * dca < da) - rca = 2 * sca * dca; - else - rca = sa * da - 2 * (da - dca) * (sa - sca); - return DIV_ONE_UN16 (rca); -} - -PDF_SEPARABLE_BLEND_MODE (overlay) - -/* - * 
Darken - * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa) - */ -static inline uint64_t -blend_darken (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - uint64_t s, d; - - s = sca * da; - d = dca * sa; - return DIV_ONE_UN16 (s > d ? d : s); -} - -PDF_SEPARABLE_BLEND_MODE (darken) - -/* - * Lighten - * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa) - */ -static inline uint64_t -blend_lighten (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - uint64_t s, d; - - s = sca * da; - d = dca * sa; - return DIV_ONE_UN16 (s > d ? s : d); -} - -PDF_SEPARABLE_BLEND_MODE (lighten) - -/* - * Color dodge - * B(Dca, Da, Sca, Sa) = - * if Dca == 0 - * 0 - * if Sca == Sa - * Sa.Da - * otherwise - * Sa.Da. min (1, Dca / Da / (1 - Sca/Sa)) - */ -static inline uint64_t -blend_color_dodge (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - if (sca >= sa) - { - return dca == 0 ? 0 : DIV_ONE_UN16 (sa * da); - } - else - { - uint64_t rca = dca * sa / (sa - sca); - return DIV_ONE_UN16 (sa * MIN (rca, da)); - } -} - -PDF_SEPARABLE_BLEND_MODE (color_dodge) - -/* - * Color burn - * B(Dca, Da, Sca, Sa) = - * if Dca == Da - * Sa.Da - * if Sca == 0 - * 0 - * otherwise - * Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca)) - */ -static inline uint64_t -blend_color_burn (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - if (sca == 0) - { - return dca < da ? 0 : DIV_ONE_UN16 (sa * da); - } - else - { - uint64_t rca = (da - dca) * sa / sca; - return DIV_ONE_UN16 (sa * (MAX (rca, da) - rca)); - } -} - -PDF_SEPARABLE_BLEND_MODE (color_burn) - -/* - * Hard light - * B(Dca, Da, Sca, Sa) = - * if 2.Sca < Sa - * 2.Sca.Dca - * otherwise - * Sa.Da - 2.(Da - Dca).(Sa - Sca) - */ -static inline uint64_t -blend_hard_light (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - if (2 * sca < sa) - return DIV_ONE_UN16 (2 * sca * dca); - else - return DIV_ONE_UN16 (sa * da - 2 * (da - dca) * (sa - sca)); -} - -PDF_SEPARABLE_BLEND_MODE (hard_light) - -/* - * Soft light - * B(Dca, Da, Sca, Sa) = - * if (2.Sca <= Sa) - * Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa)) - * otherwise if Dca.4 <= Da - * Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3) - * otherwise - * (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa)) - */ -static inline uint64_t -blend_soft_light (uint64_t dca_org, - uint64_t da_org, - uint64_t sca_org, - uint64_t sa_org) -{ - double dca = dca_org * (1.0 / MASK); - double da = da_org * (1.0 / MASK); - double sca = sca_org * (1.0 / MASK); - double sa = sa_org * (1.0 / MASK); - double rca; - - if (2 * sca < sa) - { - if (da == 0) - rca = dca * sa; - else - rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da; - } - else if (da == 0) - { - rca = 0; - } - else if (4 * dca <= da) - { - rca = dca * sa + - (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3); - } - else - { - rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa); - } - return rca * MASK + 0.5; -} - -PDF_SEPARABLE_BLEND_MODE (soft_light) - -/* - * Difference - * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da) - */ -static inline uint64_t -blend_difference (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - uint64_t dcasa = dca * sa; - uint64_t scada = sca * da; - - if (scada < dcasa) - return DIV_ONE_UN16 (dcasa - scada); - else - return DIV_ONE_UN16 (scada - dcasa); -} - -PDF_SEPARABLE_BLEND_MODE (difference) - -/* - * Exclusion - * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca) - */ - -/* This can be made faster by writing it directly and not using - * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */ - -static 
inline uint64_t -blend_exclusion (uint64_t dca, uint64_t da, uint64_t sca, uint64_t sa) -{ - return DIV_ONE_UN16 (sca * da + dca * sa - 2 * dca * sca); -} - -PDF_SEPARABLE_BLEND_MODE (exclusion) - -#undef PDF_SEPARABLE_BLEND_MODE - -/* - * PDF nonseperable blend modes are implemented using the following functions - * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid - * and min value of the red, green and blue components. - * - * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue - * - * clip_color (C): - * l = LUM (C) - * min = Cmin - * max = Cmax - * if n < 0.0 - * C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) ) - * if x > 1.0 - * C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) ) - * return C - * - * set_lum (C, l): - * d = l – LUM (C) - * C += d - * return clip_color (C) - * - * SAT (C) = CH_MAX (C) - CH_MIN (C) - * - * set_sat (C, s): - * if Cmax > Cmin - * Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) ) - * Cmax = s - * else - * Cmid = Cmax = 0.0 - * Cmin = 0.0 - * return C - */ - -/* For premultiplied colors, we need to know what happens when C is - * multiplied by a real number. LUM and SAT are linear: - * - * LUM (r × C) = r × LUM (C) SAT (r * C) = r * SAT (C) - * - * If we extend clip_color with an extra argument a and change - * - * if x >= 1.0 - * - * into - * - * if x >= a - * - * then clip_color is also linear: - * - * r * clip_color (C, a) = clip_color (r_c, ra); - * - * for positive r. - * - * Similarly, we can extend set_lum with an extra argument that is just passed - * on to clip_color: - * - * r * set_lum ( C, l, a) - * - * = r × clip_color ( C + l - LUM (C), a) - * - * = clip_color ( r * C + r × l - r * LUM (C), r * a) - * - * = set_lum ( r * C, r * l, r * a) - * - * Finally, set_sat: - * - * r * set_sat (C, s) = set_sat (x * C, r * s) - * - * The above holds for all non-zero x, because they x'es in the fraction for - * C_mid cancel out. Specifically, it holds for x = r: - * - * r * set_sat (C, s) = set_sat (r_c, rs) - * - */ - -/* So, for the non-separable PDF blend modes, we have (using s, d for - * non-premultiplied colors, and S, D for premultiplied: - * - * Color: - * - * a_s * a_d * B(s, d) - * = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1) - * = set_lum (S * a_d, a_s * LUM (D), a_s * a_d) - * - * - * Luminosity: - * - * a_s * a_d * B(s, d) - * = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1) - * = set_lum (a_s * D, a_d * LUM(S), a_s * a_d) - * - * - * Saturation: - * - * a_s * a_d * B(s, d) - * = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1) - * = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)), - * a_s * LUM (D), a_s * a_d) - * = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d)) - * - * Hue: - * - * a_s * a_d * B(s, d) - * = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1) - * = a_s * a_d * set_lum (set_sat (a_d * S, a_s * SAT (D)), - * a_s * LUM (D), a_s * a_d) - * - */ - -#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2])) -#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? 
c[1] : c[2])) -#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100) -#define SAT(c) (CH_MAX (c) - CH_MIN (c)) - -#define PDF_NON_SEPARABLE_BLEND_MODE(name) \ - static void \ - combine_ ## name ## _u (pixman_implementation_t *imp, \ - pixman_op_t op, \ - uint64_t *dest, \ - const uint64_t *src, \ - const uint64_t *mask, \ - int width) \ - { \ - int i; \ - for (i = 0; i < width; ++i) \ - { \ - uint64_t s = combine_mask (src, mask, i); \ - uint64_t d = *(dest + i); \ - uint16_t sa = ALPHA_16 (s); \ - uint16_t isa = ~sa; \ - uint16_t da = ALPHA_16 (d); \ - uint16_t ida = ~da; \ - uint64_t result; \ - uint64_t sc[3], dc[3], c[3]; \ - \ - result = d; \ - UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16 (result, isa, s, ida); \ - dc[0] = RED_16 (d); \ - sc[0] = RED_16 (s); \ - dc[1] = GREEN_16 (d); \ - sc[1] = GREEN_16 (s); \ - dc[2] = BLUE_16 (d); \ - sc[2] = BLUE_16 (s); \ - blend_ ## name (c, dc, da, sc, sa); \ - \ - *(dest + i) = result + \ - (DIV_ONE_UN16 (sa * da) << A_SHIFT) + \ - (DIV_ONE_UN16 (c[0]) << R_SHIFT) + \ - (DIV_ONE_UN16 (c[1]) << G_SHIFT) + \ - (DIV_ONE_UN16 (c[2])); \ - } \ - } - -static void -set_lum (uint64_t dest[3], uint64_t src[3], uint64_t sa, uint64_t lum) -{ - double a, l, min, max; - double tmp[3]; - - a = sa * (1.0 / MASK); - - l = lum * (1.0 / MASK); - tmp[0] = src[0] * (1.0 / MASK); - tmp[1] = src[1] * (1.0 / MASK); - tmp[2] = src[2] * (1.0 / MASK); - - l = l - LUM (tmp); - tmp[0] += l; - tmp[1] += l; - tmp[2] += l; - - /* clip_color */ - l = LUM (tmp); - min = CH_MIN (tmp); - max = CH_MAX (tmp); - - if (min < 0) - { - if (l - min == 0.0) - { - tmp[0] = 0; - tmp[1] = 0; - tmp[2] = 0; - } - else - { - tmp[0] = l + (tmp[0] - l) * l / (l - min); - tmp[1] = l + (tmp[1] - l) * l / (l - min); - tmp[2] = l + (tmp[2] - l) * l / (l - min); - } - } - if (max > a) - { - if (max - l == 0.0) - { - tmp[0] = a; - tmp[1] = a; - tmp[2] = a; - } - else - { - tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l); - tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l); - tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l); - } - } - - dest[0] = tmp[0] * MASK + 0.5; - dest[1] = tmp[1] * MASK + 0.5; - dest[2] = tmp[2] * MASK + 0.5; -} - -static void -set_sat (uint64_t dest[3], uint64_t src[3], uint64_t sat) -{ - int id[3]; - uint64_t min, max; - - if (src[0] > src[1]) - { - if (src[0] > src[2]) - { - id[0] = 0; - if (src[1] > src[2]) - { - id[1] = 1; - id[2] = 2; - } - else - { - id[1] = 2; - id[2] = 1; - } - } - else - { - id[0] = 2; - id[1] = 0; - id[2] = 1; - } - } - else - { - if (src[0] > src[2]) - { - id[0] = 1; - id[1] = 0; - id[2] = 2; - } - else - { - id[2] = 0; - if (src[1] > src[2]) - { - id[0] = 1; - id[1] = 2; - } - else - { - id[0] = 2; - id[1] = 1; - } - } - } - - max = dest[id[0]]; - min = dest[id[2]]; - if (max > min) - { - dest[id[1]] = (dest[id[1]] - min) * sat / (max - min); - dest[id[0]] = sat; - dest[id[2]] = 0; - } - else - { - dest[0] = dest[1] = dest[2] = 0; - } -} - -/* - * Hue: - * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb)) - */ -static inline void -blend_hsl_hue (uint64_t c[3], - uint64_t dc[3], - uint64_t da, - uint64_t sc[3], - uint64_t sa) -{ - c[0] = sc[0] * da; - c[1] = sc[1] * da; - c[2] = sc[2] * da; - set_sat (c, c, SAT (dc) * sa); - set_lum (c, c, sa * da, LUM (dc) * sa); -} - -PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue) - -/* - * Saturation: - * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb)) - */ -static inline void -blend_hsl_saturation (uint64_t c[3], - uint64_t dc[3], - uint64_t da, - uint64_t sc[3], - uint64_t sa) -{ - c[0] = dc[0] * sa; - c[1] = 
dc[1] * sa; - c[2] = dc[2] * sa; - set_sat (c, c, SAT (sc) * da); - set_lum (c, c, sa * da, LUM (dc) * sa); -} - -PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation) - -/* - * Color: - * B(Cb, Cs) = set_lum (Cs, LUM (Cb)) - */ -static inline void -blend_hsl_color (uint64_t c[3], - uint64_t dc[3], - uint64_t da, - uint64_t sc[3], - uint64_t sa) -{ - c[0] = sc[0] * da; - c[1] = sc[1] * da; - c[2] = sc[2] * da; - set_lum (c, c, sa * da, LUM (dc) * sa); -} - -PDF_NON_SEPARABLE_BLEND_MODE (hsl_color) - -/* - * Luminosity: - * B(Cb, Cs) = set_lum (Cb, LUM (Cs)) - */ -static inline void -blend_hsl_luminosity (uint64_t c[3], - uint64_t dc[3], - uint64_t da, - uint64_t sc[3], - uint64_t sa) -{ - c[0] = dc[0] * sa; - c[1] = dc[1] * sa; - c[2] = dc[2] * sa; - set_lum (c, c, sa * da, LUM (sc) * da); -} - -PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity) - -#undef SAT -#undef LUM -#undef CH_MAX -#undef CH_MIN -#undef PDF_NON_SEPARABLE_BLEND_MODE - -/* Overlay - * - * All of the disjoint composing functions - * - * The four entries in the first column indicate what source contributions - * come from each of the four areas of the picture -- areas covered by neither - * A nor B, areas covered only by A, areas covered only by B and finally - * areas covered by both A and B. - * - * Disjoint Conjoint - * Fa Fb Fa Fb - * (0,0,0,0) 0 0 0 0 - * (0,A,0,A) 1 0 1 0 - * (0,0,B,B) 0 1 0 1 - * (0,A,B,A) 1 min((1-a)/b,1) 1 max(1-a/b,0) - * (0,A,B,B) min((1-b)/a,1) 1 max(1-b/a,0) 1 - * (0,0,0,A) max(1-(1-b)/a,0) 0 min(1,b/a) 0 - * (0,0,0,B) 0 max(1-(1-a)/b,0) 0 min(a/b,1) - * (0,A,0,0) min(1,(1-b)/a) 0 max(1-b/a,0) 0 - * (0,0,B,0) 0 min(1,(1-a)/b) 0 max(1-a/b,0) - * (0,0,B,A) max(1-(1-b)/a,0) min(1,(1-a)/b) min(1,b/a) max(1-a/b,0) - * (0,A,0,B) min(1,(1-b)/a) max(1-(1-a)/b,0) max(1-b/a,0) min(1,a/b) - * (0,A,B,0) min(1,(1-b)/a) min(1,(1-a)/b) max(1-b/a,0) max(1-a/b,0) - */ - -#define COMBINE_A_OUT 1 -#define COMBINE_A_IN 2 -#define COMBINE_B_OUT 4 -#define COMBINE_B_IN 8 - -#define COMBINE_CLEAR 0 -#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN) -#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN) -#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN) -#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN) -#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN) -#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN) -#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT) - -/* portion covered by a but not b */ -static uint16_t -combine_disjoint_out_part (uint16_t a, uint16_t b) -{ - /* min (1, (1-b) / a) */ - - b = ~b; /* 1 - b */ - if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ - return MASK; /* 1 */ - return DIV_UN16 (b, a); /* (1-b) / a */ -} - -/* portion covered by both a and b */ -static uint16_t -combine_disjoint_in_part (uint16_t a, uint16_t b) -{ - /* max (1-(1-b)/a,0) */ - /* = - min ((1-b)/a - 1, 0) */ - /* = 1 - min (1, (1-b)/a) */ - - b = ~b; /* 1 - b */ - if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ - return 0; /* 1 - 1 */ - return ~DIV_UN16(b, a); /* 1 - (1-b) / a */ -} - -/* portion covered by a but not b */ -static uint16_t -combine_conjoint_out_part (uint16_t a, uint16_t b) -{ - /* max (1-b/a,0) */ - /* = 1-min(b/a,1) */ - - /* min (1, (1-b) / a) */ - - if (b >= a) /* b >= a -> b/a >= 1 */ - return 0x00; /* 0 */ - return ~DIV_UN16(b, a); /* 1 - b/a */ -} - -/* portion covered by both a and b */ -static uint16_t -combine_conjoint_in_part (uint16_t a, uint16_t b) -{ - /* min (1,b/a) */ - - if (b >= a) /* b >= a -> b/a >= 1 */ - return MASK; /* 1 */ - return DIV_UN16 (b, a); /* b/a */ -} - 
-#define GET_COMP(v, i) ((uint32_t) (uint16_t) ((v) >> i)) - -#define ADD(x, y, i, t) \ - ((t) = GET_COMP (x, i) + GET_COMP (y, i), \ - (uint64_t) ((uint16_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i)) - -#define GENERIC(x, y, i, ax, ay, t, u, v) \ - ((t) = (MUL_UN16 (GET_COMP (y, i), ay, (u)) + \ - MUL_UN16 (GET_COMP (x, i), ax, (v))), \ - (uint64_t) ((uint16_t) ((t) | \ - (0 - ((t) >> G_SHIFT)))) << (i)) - -static void -combine_disjoint_general_u (uint64_t * dest, - const uint64_t *src, - const uint64_t *mask, - int width, - uint16_t combine) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t m, n, o, p; - uint32_t Fa, Fb, t, u, v; - uint16_t sa = s >> A_SHIFT; - uint16_t da = d >> A_SHIFT; - - switch (combine & COMBINE_A) - { - default: - Fa = 0; - break; - - case COMBINE_A_OUT: - Fa = combine_disjoint_out_part (sa, da); - break; - - case COMBINE_A_IN: - Fa = combine_disjoint_in_part (sa, da); - break; - - case COMBINE_A: - Fa = MASK; - break; - } - - switch (combine & COMBINE_B) - { - default: - Fb = 0; - break; - - case COMBINE_B_OUT: - Fb = combine_disjoint_out_part (da, sa); - break; - - case COMBINE_B_IN: - Fb = combine_disjoint_in_part (da, sa); - break; - - case COMBINE_B: - Fb = MASK; - break; - } - m = GENERIC (s, d, 0, Fa, Fb, t, u, v); - n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v); - o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v); - p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v); - s = m | n | o | p; - *(dest + i) = s; - } -} - -static void -combine_disjoint_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint32_t a = s >> A_SHIFT; - - if (s != 0x00) - { - uint64_t d = *(dest + i); - a = combine_disjoint_out_part (d >> A_SHIFT, a); - UN16x4_MUL_UN16_ADD_UN16x4 (d, a, s); - - *(dest + i) = d; - } - } -} - -static void -combine_disjoint_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN); -} - -static void -combine_disjoint_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN); -} - -static void -combine_disjoint_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); -} - -static void -combine_disjoint_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); -} - -static void -combine_disjoint_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); -} - -static void -combine_disjoint_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); -} - -static void -combine_disjoint_xor_u 
(pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR); -} - -static void -combine_conjoint_general_u (uint64_t * dest, - const uint64_t *src, - const uint64_t *mask, - int width, - uint16_t combine) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = combine_mask (src, mask, i); - uint64_t d = *(dest + i); - uint64_t m, n, o, p; - uint32_t Fa, Fb, t, u, v; - uint16_t sa = s >> A_SHIFT; - uint16_t da = d >> A_SHIFT; - - switch (combine & COMBINE_A) - { - default: - Fa = 0; - break; - - case COMBINE_A_OUT: - Fa = combine_conjoint_out_part (sa, da); - break; - - case COMBINE_A_IN: - Fa = combine_conjoint_in_part (sa, da); - break; - - case COMBINE_A: - Fa = MASK; - break; - } - - switch (combine & COMBINE_B) - { - default: - Fb = 0; - break; - - case COMBINE_B_OUT: - Fb = combine_conjoint_out_part (da, sa); - break; - - case COMBINE_B_IN: - Fb = combine_conjoint_in_part (da, sa); - break; - - case COMBINE_B: - Fb = MASK; - break; - } - - m = GENERIC (s, d, 0, Fa, Fb, t, u, v); - n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v); - o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v); - p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v); - - s = m | n | o | p; - - *(dest + i) = s; - } -} - -static void -combine_conjoint_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER); -} - -static void -combine_conjoint_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER); -} - -static void -combine_conjoint_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN); -} - -static void -combine_conjoint_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN); -} - -static void -combine_conjoint_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); -} - -static void -combine_conjoint_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); -} - -static void -combine_conjoint_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); -} - -static void -combine_conjoint_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); -} - -static void -combine_conjoint_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_u (dest, src, 
mask, width, COMBINE_XOR); -} - -/************************************************************************/ -/*********************** Per Channel functions **************************/ -/************************************************************************/ - -static void -combine_clear_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - memset (dest, 0, width * sizeof(uint64_t)); -} - -static void -combine_src_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - - combine_mask_value_ca (&s, &m); - - *(dest + i) = s; - } -} - -static void -combine_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t a; - - combine_mask_ca (&s, &m); - - a = ~m; - if (a) - { - uint64_t d = *(dest + i); - UN16x4_MUL_UN16x4_ADD_UN16x4 (d, a, s); - s = d; - } - - *(dest + i) = s; - } -} - -static void -combine_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint64_t a = ~d >> A_SHIFT; - - if (a) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - - UN16x4_MUL_UN16x4 (s, m); - UN16x4_MUL_UN16_ADD_UN16x4 (s, a, d); - - *(dest + i) = s; - } - } -} - -static void -combine_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint32_t a = d >> A_SHIFT; - uint64_t s = 0; - - if (a) - { - uint64_t m = *(mask + i); - - s = *(src + i); - combine_mask_value_ca (&s, &m); - - if (a != MASK) - UN16x4_MUL_UN16 (s, a); - } - - *(dest + i) = s; - } -} - -static void -combine_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t a; - - combine_mask_alpha_ca (&s, &m); - - a = m; - if (a != ~0) - { - uint64_t d = 0; - - if (a) - { - d = *(dest + i); - UN16x4_MUL_UN16x4 (d, a); - } - - *(dest + i) = d; - } - } -} - -static void -combine_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint32_t a = ~d >> A_SHIFT; - uint64_t s = 0; - - if (a) - { - uint64_t m = *(mask + i); - - s = *(src + i); - combine_mask_value_ca (&s, &m); - - if (a != MASK) - UN16x4_MUL_UN16 (s, a); - } - - *(dest + i) = s; - } -} - -static void -combine_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t a; - - combine_mask_alpha_ca (&s, &m); - - a = ~m; - if (a != ~0) - { - uint64_t d = 0; - - if (a) - { - d = *(dest + i); - UN16x4_MUL_UN16x4 (d, a); - } - - *(dest + i) = d; - } - } -} - 
-static void -combine_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t ad; - uint32_t as = d >> A_SHIFT; - - combine_mask_ca (&s, &m); - - ad = ~m; - - UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16 (d, ad, s, as); - - *(dest + i) = d; - } -} - -static void -combine_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t ad; - uint32_t as = ~d >> A_SHIFT; - - combine_mask_ca (&s, &m); - - ad = m; - - UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16 (d, ad, s, as); - - *(dest + i) = d; - } -} - -static void -combine_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t d = *(dest + i); - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t ad; - uint32_t as = ~d >> A_SHIFT; - - combine_mask_ca (&s, &m); - - ad = ~m; - - UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16 (d, ad, s, as); - - *(dest + i) = d; - } -} - -static void -combine_add_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s = *(src + i); - uint64_t m = *(mask + i); - uint64_t d = *(dest + i); - - combine_mask_value_ca (&s, &m); - - UN16x4_ADD_UN16x4 (d, s); - - *(dest + i) = d; - } -} - -static void -combine_saturate_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s, d; - uint32_t sa, sr, sg, sb, da; - uint32_t t, u, v; - uint64_t m, n, o, p; - - d = *(dest + i); - s = *(src + i); - m = *(mask + i); - - combine_mask_ca (&s, &m); - - sa = (m >> A_SHIFT); - sr = (m >> R_SHIFT) & MASK; - sg = (m >> G_SHIFT) & MASK; - sb = m & MASK; - da = ~d >> A_SHIFT; - - if (sb <= da) - m = ADD (s, d, 0, t); - else - m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v); - - if (sg <= da) - n = ADD (s, d, G_SHIFT, t); - else - n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v); - - if (sr <= da) - o = ADD (s, d, R_SHIFT, t); - else - o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v); - - if (sa <= da) - p = ADD (s, d, A_SHIFT, t); - else - p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v); - - *(dest + i) = m | n | o | p; - } -} - -static void -combine_disjoint_general_ca (uint64_t * dest, - const uint64_t *src, - const uint64_t *mask, - int width, - uint16_t combine) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s, d; - uint64_t m, n, o, p; - uint64_t Fa, Fb; - uint32_t t, u, v; - uint64_t sa; - uint16_t da; - - s = *(src + i); - m = *(mask + i); - d = *(dest + i); - da = d >> A_SHIFT; - - combine_mask_ca (&s, &m); - - sa = m; - - switch (combine & COMBINE_A) - { - default: - Fa = 0; - break; - - case COMBINE_A_OUT: - m = (uint64_t)combine_disjoint_out_part ((uint16_t) (sa >> 0), da); - n = (uint64_t)combine_disjoint_out_part ((uint16_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (uint64_t)combine_disjoint_out_part ((uint16_t) 
(sa >> R_SHIFT), da) << R_SHIFT; - p = (uint64_t)combine_disjoint_out_part ((uint16_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m | n | o | p; - break; - - case COMBINE_A_IN: - m = (uint64_t)combine_disjoint_in_part ((uint16_t) (sa >> 0), da); - n = (uint64_t)combine_disjoint_in_part ((uint16_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (uint64_t)combine_disjoint_in_part ((uint16_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (uint64_t)combine_disjoint_in_part ((uint16_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m | n | o | p; - break; - - case COMBINE_A: - Fa = ~0; - break; - } - - switch (combine & COMBINE_B) - { - default: - Fb = 0; - break; - - case COMBINE_B_OUT: - m = (uint64_t)combine_disjoint_out_part (da, (uint16_t) (sa >> 0)); - n = (uint64_t)combine_disjoint_out_part (da, (uint16_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (uint64_t)combine_disjoint_out_part (da, (uint16_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (uint64_t)combine_disjoint_out_part (da, (uint16_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m | n | o | p; - break; - - case COMBINE_B_IN: - m = (uint64_t)combine_disjoint_in_part (da, (uint16_t) (sa >> 0)); - n = (uint64_t)combine_disjoint_in_part (da, (uint16_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (uint64_t)combine_disjoint_in_part (da, (uint16_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (uint64_t)combine_disjoint_in_part (da, (uint16_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m | n | o | p; - break; - - case COMBINE_B: - Fb = ~0; - break; - } - m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v); - n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v); - o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v); - p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v); - - s = m | n | o | p; - - *(dest + i) = s; - } -} - -static void -combine_disjoint_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); -} - -static void -combine_disjoint_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); -} - -static void -combine_disjoint_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); -} - -static void -combine_disjoint_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT); -} - -static void -combine_disjoint_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); -} - -static void -combine_disjoint_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); -} - -static void -combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca 
(dest, src, mask, width, COMBINE_B_ATOP); -} - -static void -combine_disjoint_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR); -} - -static void -combine_conjoint_general_ca (uint64_t * dest, - const uint64_t *src, - const uint64_t *mask, - int width, - uint16_t combine) -{ - int i; - - for (i = 0; i < width; ++i) - { - uint64_t s, d; - uint64_t m, n, o, p; - uint64_t Fa, Fb; - uint32_t t, u, v; - uint64_t sa; - uint16_t da; - - s = *(src + i); - m = *(mask + i); - d = *(dest + i); - da = d >> A_SHIFT; - - combine_mask_ca (&s, &m); - - sa = m; - - switch (combine & COMBINE_A) - { - default: - Fa = 0; - break; - - case COMBINE_A_OUT: - m = (uint64_t)combine_conjoint_out_part ((uint16_t) (sa >> 0), da); - n = (uint64_t)combine_conjoint_out_part ((uint16_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (uint64_t)combine_conjoint_out_part ((uint16_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (uint64_t)combine_conjoint_out_part ((uint16_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m | n | o | p; - break; - - case COMBINE_A_IN: - m = (uint64_t)combine_conjoint_in_part ((uint16_t) (sa >> 0), da); - n = (uint64_t)combine_conjoint_in_part ((uint16_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (uint64_t)combine_conjoint_in_part ((uint16_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (uint64_t)combine_conjoint_in_part ((uint16_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m | n | o | p; - break; - - case COMBINE_A: - Fa = ~0; - break; - } - - switch (combine & COMBINE_B) - { - default: - Fb = 0; - break; - - case COMBINE_B_OUT: - m = (uint64_t)combine_conjoint_out_part (da, (uint16_t) (sa >> 0)); - n = (uint64_t)combine_conjoint_out_part (da, (uint16_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (uint64_t)combine_conjoint_out_part (da, (uint16_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (uint64_t)combine_conjoint_out_part (da, (uint16_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m | n | o | p; - break; - - case COMBINE_B_IN: - m = (uint64_t)combine_conjoint_in_part (da, (uint16_t) (sa >> 0)); - n = (uint64_t)combine_conjoint_in_part (da, (uint16_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (uint64_t)combine_conjoint_in_part (da, (uint16_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (uint64_t)combine_conjoint_in_part (da, (uint16_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m | n | o | p; - break; - - case COMBINE_B: - Fb = ~0; - break; - } - m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v); - n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v); - o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v); - p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v); - - s = m | n | o | p; - - *(dest + i) = s; - } -} - -static void -combine_conjoint_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); -} - -static void -combine_conjoint_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER); -} - -static void -combine_conjoint_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - 
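    /* Editorial note, not part of the original source: for COMBINE_A_IN
     * (used here) the general helper above picks, per channel,
     * Fa = combine_conjoint_in_part (sa_channel, da) and Fb = 0; that is
     * Fa = MASK when sa_channel <= da, and DIV_UN16 (da, sa_channel) =
     * da * MASK / sa_channel otherwise.  E.g. sa = 0xffff, da = 0x8000
     * halves every source channel. */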
combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); -} - -static void -combine_conjoint_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); -} - -static void -combine_conjoint_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT); -} - -static void -combine_conjoint_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); -} - -static void -combine_conjoint_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); -} - -static void -combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP); -} - -static void -combine_conjoint_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR); -} - -void -_pixman_setup_combiner_functions_64 (pixman_implementation_t *imp) -{ - /* Unified alpha */ - imp->combine_64[PIXMAN_OP_CLEAR] = combine_clear; - imp->combine_64[PIXMAN_OP_SRC] = combine_src_u; - imp->combine_64[PIXMAN_OP_DST] = combine_dst; - imp->combine_64[PIXMAN_OP_OVER] = combine_over_u; - imp->combine_64[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u; - imp->combine_64[PIXMAN_OP_IN] = combine_in_u; - imp->combine_64[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u; - imp->combine_64[PIXMAN_OP_OUT] = combine_out_u; - imp->combine_64[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u; - imp->combine_64[PIXMAN_OP_ATOP] = combine_atop_u; - imp->combine_64[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u; - imp->combine_64[PIXMAN_OP_XOR] = combine_xor_u; - imp->combine_64[PIXMAN_OP_ADD] = combine_add_u; - imp->combine_64[PIXMAN_OP_SATURATE] = combine_saturate_u; - - /* Disjoint, unified */ - imp->combine_64[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear; - imp->combine_64[PIXMAN_OP_DISJOINT_SRC] = combine_src_u; - imp->combine_64[PIXMAN_OP_DISJOINT_DST] = combine_dst; - imp->combine_64[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u; - imp->combine_64[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u; - imp->combine_64[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u; - imp->combine_64[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u; - imp->combine_64[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u; - imp->combine_64[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u; - imp->combine_64[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u; - imp->combine_64[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u; - imp->combine_64[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u; - - /* Conjoint, unified */ - imp->combine_64[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear; - imp->combine_64[PIXMAN_OP_CONJOINT_SRC] = combine_src_u; - imp->combine_64[PIXMAN_OP_CONJOINT_DST] = combine_dst; - 
imp->combine_64[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u; - imp->combine_64[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u; - imp->combine_64[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u; - imp->combine_64[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u; - imp->combine_64[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u; - imp->combine_64[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u; - imp->combine_64[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u; - imp->combine_64[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u; - imp->combine_64[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u; - - imp->combine_64[PIXMAN_OP_MULTIPLY] = combine_multiply_u; - imp->combine_64[PIXMAN_OP_SCREEN] = combine_screen_u; - imp->combine_64[PIXMAN_OP_OVERLAY] = combine_overlay_u; - imp->combine_64[PIXMAN_OP_DARKEN] = combine_darken_u; - imp->combine_64[PIXMAN_OP_LIGHTEN] = combine_lighten_u; - imp->combine_64[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u; - imp->combine_64[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u; - imp->combine_64[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u; - imp->combine_64[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u; - imp->combine_64[PIXMAN_OP_DIFFERENCE] = combine_difference_u; - imp->combine_64[PIXMAN_OP_EXCLUSION] = combine_exclusion_u; - imp->combine_64[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u; - imp->combine_64[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u; - imp->combine_64[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u; - imp->combine_64[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u; - - /* Component alpha combiners */ - imp->combine_64_ca[PIXMAN_OP_CLEAR] = combine_clear_ca; - imp->combine_64_ca[PIXMAN_OP_SRC] = combine_src_ca; - /* dest */ - imp->combine_64_ca[PIXMAN_OP_OVER] = combine_over_ca; - imp->combine_64_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_IN] = combine_in_ca; - imp->combine_64_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_OUT] = combine_out_ca; - imp->combine_64_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_ATOP] = combine_atop_ca; - imp->combine_64_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_XOR] = combine_xor_ca; - imp->combine_64_ca[PIXMAN_OP_ADD] = combine_add_ca; - imp->combine_64_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca; - - /* Disjoint CA */ - imp->combine_64_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca; - - /* Conjoint CA */ - imp->combine_64_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca; - 
imp->combine_64_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca; - imp->combine_64_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca; - - imp->combine_64_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca; - imp->combine_64_ca[PIXMAN_OP_SCREEN] = combine_screen_ca; - imp->combine_64_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca; - imp->combine_64_ca[PIXMAN_OP_DARKEN] = combine_darken_ca; - imp->combine_64_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca; - imp->combine_64_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca; - imp->combine_64_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca; - imp->combine_64_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca; - imp->combine_64_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca; - imp->combine_64_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca; - imp->combine_64_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca; - - /* It is not clear that these make sense, so make them noops for now */ - imp->combine_64_ca[PIXMAN_OP_HSL_HUE] = combine_dst; - imp->combine_64_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst; - imp->combine_64_ca[PIXMAN_OP_HSL_COLOR] = combine_dst; - imp->combine_64_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst; -} - diff --git a/programs/develop/libraries/pixman/pixman-combine64.h b/programs/develop/libraries/pixman/pixman-combine64.h deleted file mode 100644 index ed9ffebf6c..0000000000 --- a/programs/develop/libraries/pixman/pixman-combine64.h +++ /dev/null @@ -1,230 +0,0 @@ -/* WARNING: This file is generated by combine.pl from combine.inc. - Please edit one of those files rather than this one. */ - -#line 1 "pixman-combine.c.template" - -#define COMPONENT_SIZE 16 -#define MASK 0xffffULL -#define ONE_HALF 0x8000ULL - -#define A_SHIFT 16 * 3 -#define R_SHIFT 16 * 2 -#define G_SHIFT 16 -#define A_MASK 0xffff000000000000ULL -#define R_MASK 0xffff00000000ULL -#define G_MASK 0xffff0000ULL - -#define RB_MASK 0xffff0000ffffULL -#define AG_MASK 0xffff0000ffff0000ULL -#define RB_ONE_HALF 0x800000008000ULL -#define RB_MASK_PLUS_ONE 0x10000000010000ULL - -#define ALPHA_16(x) ((x) >> A_SHIFT) -#define RED_16(x) (((x) >> R_SHIFT) & MASK) -#define GREEN_16(x) (((x) >> G_SHIFT) & MASK) -#define BLUE_16(x) ((x) & MASK) - -/* - * Helper macros. - */ - -#define MUL_UN16(a, b, t) \ - ((t) = (a) * (b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT )) - -#define DIV_UN16(a, b) \ - (((uint32_t) (a) * MASK) / (b)) - -#define ADD_UN16(x, y, t) \ - ((t) = (x) + (y), \ - (uint64_t) (uint16_t) ((t) | (0 - ((t) >> G_SHIFT)))) - -#define DIV_ONE_UN16(x) \ - (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT) - -/* - * The methods below use some tricks to be able to do two color - * components at the same time. 
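 * (Editorial illustration, not part of the original comment: with
 * G_SHIFT = 16, blue occupies bits 0-15 and red bits 32-47 of a 64-bit
 * pixel, so x & RB_MASK holds both channels at once.  A single 64-bit
 * multiply by a 16-bit alpha then scales both in one operation, because
 * each 32-bit lane has enough headroom for a 16x16-bit product;
 * UN16_rb_MUL_UN16 below is MUL_UN16 applied to the two lanes in
 * parallel.)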
- */ - -/* - * x_rb = (x_rb * a) / 255 - */ -#define UN16_rb_MUL_UN16(x, a, t) \ - do \ - { \ - t = ((x) & RB_MASK) * (a); \ - t += RB_ONE_HALF; \ - x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ - x &= RB_MASK; \ - } while (0) - -/* - * x_rb = min (x_rb + y_rb, 255) - */ -#define UN16_rb_ADD_UN16_rb(x, y, t) \ - do \ - { \ - t = ((x) + (y)); \ - t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ - x = (t & RB_MASK); \ - } while (0) - -/* - * x_rb = (x_rb * a_rb) / 255 - */ -#define UN16_rb_MUL_UN16_rb(x, a, t) \ - do \ - { \ - t = (x & MASK) * (a & MASK); \ - t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK); \ - t += RB_ONE_HALF; \ - t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ - x = t & RB_MASK; \ - } while (0) - -/* - * x_c = (x_c * a) / 255 - */ -#define UN16x4_MUL_UN16(x, a) \ - do \ - { \ - uint64_t r1__, r2__, t__; \ - \ - r1__ = (x); \ - UN16_rb_MUL_UN16 (r1__, (a), t__); \ - \ - r2__ = (x) >> G_SHIFT; \ - UN16_rb_MUL_UN16 (r2__, (a), t__); \ - \ - (x) = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - * x_c = (x_c * a) / 255 + y_c - */ -#define UN16x4_MUL_UN16_ADD_UN16x4(x, a, y) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x); \ - r2__ = (y) & RB_MASK; \ - UN16_rb_MUL_UN16 (r1__, (a), t__); \ - UN16_rb_ADD_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = (x) >> G_SHIFT; \ - r3__ = ((y) >> G_SHIFT) & RB_MASK; \ - UN16_rb_MUL_UN16 (r2__, (a), t__); \ - UN16_rb_ADD_UN16_rb (r2__, r3__, t__); \ - \ - (x) = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - * x_c = (x_c * a + y_c * b) / 255 - */ -#define UN16x4_MUL_UN16_ADD_UN16x4_MUL_UN16(x, a, y, b) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x); \ - r2__ = (y); \ - UN16_rb_MUL_UN16 (r1__, (a), t__); \ - UN16_rb_MUL_UN16 (r2__, (b), t__); \ - UN16_rb_ADD_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = ((x) >> G_SHIFT); \ - r3__ = ((y) >> G_SHIFT); \ - UN16_rb_MUL_UN16 (r2__, (a), t__); \ - UN16_rb_MUL_UN16 (r3__, (b), t__); \ - UN16_rb_ADD_UN16_rb (r2__, r3__, t__); \ - \ - (x) = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - * x_c = (x_c * a_c) / 255 - */ -#define UN16x4_MUL_UN16x4(x, a) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x); \ - r2__ = (a); \ - UN16_rb_MUL_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = (x) >> G_SHIFT; \ - r3__ = (a) >> G_SHIFT; \ - UN16_rb_MUL_UN16_rb (r2__, r3__, t__); \ - \ - (x) = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - * x_c = (x_c * a_c) / 255 + y_c - */ -#define UN16x4_MUL_UN16x4_ADD_UN16x4(x, a, y) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x); \ - r2__ = (a); \ - UN16_rb_MUL_UN16_rb (r1__, r2__, t__); \ - r2__ = (y) & RB_MASK; \ - UN16_rb_ADD_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = ((x) >> G_SHIFT); \ - r3__ = ((a) >> G_SHIFT); \ - UN16_rb_MUL_UN16_rb (r2__, r3__, t__); \ - r3__ = ((y) >> G_SHIFT) & RB_MASK; \ - UN16_rb_ADD_UN16_rb (r2__, r3__, t__); \ - \ - (x) = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - * x_c = (x_c * a_c + y_c * b) / 255 - */ -#define UN16x4_MUL_UN16x4_ADD_UN16x4_MUL_UN16(x, a, y, b) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x); \ - r2__ = (a); \ - UN16_rb_MUL_UN16_rb (r1__, r2__, t__); \ - r2__ = (y); \ - UN16_rb_MUL_UN16 (r2__, (b), t__); \ - UN16_rb_ADD_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = (x) >> G_SHIFT; \ - r3__ = (a) >> G_SHIFT; \ - UN16_rb_MUL_UN16_rb (r2__, r3__, t__); \ - r3__ = (y) >> G_SHIFT; \ - UN16_rb_MUL_UN16 (r3__, (b), t__); \ - UN16_rb_ADD_UN16_rb (r2__, r3__, t__); \ - \ - x = r1__ | (r2__ << G_SHIFT); \ - } while (0) - -/* - x_c = 
min(x_c + y_c, 255) -*/ -#define UN16x4_ADD_UN16x4(x, y) \ - do \ - { \ - uint64_t r1__, r2__, r3__, t__; \ - \ - r1__ = (x) & RB_MASK; \ - r2__ = (y) & RB_MASK; \ - UN16_rb_ADD_UN16_rb (r1__, r2__, t__); \ - \ - r2__ = ((x) >> G_SHIFT) & RB_MASK; \ - r3__ = ((y) >> G_SHIFT) & RB_MASK; \ - UN16_rb_ADD_UN16_rb (r2__, r3__, t__); \ - \ - x = r1__ | (r2__ << G_SHIFT); \ - } while (0) diff --git a/programs/develop/libraries/pixman/pixman-compiler.h b/programs/develop/libraries/pixman/pixman-compiler.h index 0e08589c83..9b190b422f 100644 --- a/programs/develop/libraries/pixman/pixman-compiler.h +++ b/programs/develop/libraries/pixman/pixman-compiler.h @@ -18,6 +18,18 @@ # define FUNC ((const char*) ("???")) #endif +#if defined (__GNUC__) +# define unlikely(expr) __builtin_expect ((expr), 0) +#else +# define unlikely(expr) (expr) +#endif + +#if defined (__GNUC__) +# define MAYBE_UNUSED __attribute__((unused)) +#else +# define MAYBE_UNUSED +#endif + #ifndef INT16_MIN # define INT16_MIN (-32767-1) #endif @@ -42,6 +54,19 @@ # define UINT32_MAX (4294967295U) #endif +#ifndef INT64_MIN +# define INT64_MIN (-9223372036854775807-1) +#endif + +#ifndef INT64_MAX +# define INT64_MAX (9223372036854775807) +#endif + +#ifndef SIZE_MAX +# define SIZE_MAX ((size_t)-1) +#endif + + #ifndef M_PI # define M_PI 3.14159265358979323846 #endif @@ -74,6 +99,10 @@ # define PIXMAN_EXPORT #endif +/* member offsets */ +#define CONTAINER_OF(type, member, data) \ + ((type *)(((uint8_t *)data) - offsetof (type, member))) + /* TLS */ #if defined(PIXMAN_NO_TLS) @@ -82,10 +111,10 @@ # define PIXMAN_GET_THREAD_LOCAL(name) \ (&name) -#elif defined(TOOLCHAIN_SUPPORTS__THREAD) +#elif defined(TLS) # define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \ - static __thread type name + static TLS type name # define PIXMAN_GET_THREAD_LOCAL(name) \ (&name) @@ -191,8 +220,7 @@ value = tls_ ## name ## _alloc (); \ } \ return value; \ - } \ - extern int no_such_variable + } # define PIXMAN_GET_THREAD_LOCAL(name) \ tls_ ## name ## _get () diff --git a/programs/develop/libraries/pixman/pixman-conical-gradient.c b/programs/develop/libraries/pixman/pixman-conical-gradient.c index 897948be4f..8bb46aecdc 100644 --- a/programs/develop/libraries/pixman/pixman-conical-gradient.c +++ b/programs/develop/libraries/pixman/pixman-conical-gradient.c @@ -50,16 +50,16 @@ coordinates_to_parameter (double x, double y, double angle) */ } -static void -conical_gradient_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static uint32_t * +conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) { - source_image_t *source = (source_image_t *)image; - gradient_t *gradient = (gradient_t *)source; + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t *buffer = iter->buffer; + + gradient_t *gradient = (gradient_t *)image; conical_gradient_t *conical = (conical_gradient_t *)image; uint32_t *end = buffer + width; pixman_gradient_walker_t walker; @@ -71,9 +71,9 @@ conical_gradient_get_scanline_32 (pixman_image_t *image, double ry = y + 0.5; double rz = 1.; - _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); - if (source->common.transform) + if (image->common.transform) { pixman_vector_t v; @@ -82,19 +82,19 @@ conical_gradient_get_scanline_32 (pixman_image_t *image, v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = 
pixman_fixed_1; - if (!pixman_transform_point_3d (source->common.transform, &v)) - return; + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; - cx = source->common.transform->matrix[0][0] / 65536.; - cy = source->common.transform->matrix[1][0] / 65536.; - cz = source->common.transform->matrix[2][0] / 65536.; + cx = image->common.transform->matrix[0][0] / 65536.; + cy = image->common.transform->matrix[1][0] / 65536.; + cz = image->common.transform->matrix[2][0] / 65536.; rx = v.vector[0] / 65536.; ry = v.vector[1] / 65536.; rz = v.vector[2] / 65536.; affine = - source->common.transform->matrix[2][0] == 0 && + image->common.transform->matrix[2][0] == 0 && v.vector[2] == pixman_fixed_1; } @@ -155,17 +155,33 @@ conical_gradient_get_scanline_32 (pixman_image_t *image, rz += cz; } } + + iter->y++; + return iter->buffer; } -static void -conical_gradient_property_changed (pixman_image_t *image) +static uint32_t * +conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) { - image->common.get_scanline_32 = conical_gradient_get_scanline_32; - image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; + uint32_t *buffer = conical_get_scanline_narrow (iter, NULL); + + pixman_expand_to_float ( + (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return buffer; +} + +void +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->iter_flags & ITER_NARROW) + iter->get_scanline = conical_get_scanline_narrow; + else + iter->get_scanline = conical_get_scanline_wide; } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_conical_gradient (pixman_point_fixed_t * center, +pixman_image_create_conical_gradient (const pixman_point_fixed_t * center, pixman_fixed_t angle, const pixman_gradient_stop_t *stops, int n_stops) @@ -191,8 +207,6 @@ pixman_image_create_conical_gradient (pixman_point_fixed_t * center, conical->center = *center; conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI; - image->common.property_changed = conical_gradient_property_changed; - return image; } diff --git a/programs/develop/libraries/pixman/pixman-cpu.c b/programs/develop/libraries/pixman/pixman-cpu.c deleted file mode 100644 index e4fb1e4097..0000000000 --- a/programs/develop/libraries/pixman/pixman-cpu.c +++ /dev/null @@ -1,598 +0,0 @@ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <string.h> - -#if defined(USE_ARM_SIMD) && defined(_MSC_VER) -/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */ -#include <windows.h> -#endif - -#include "pixman-private.h" - -#ifdef USE_VMX - -/* The CPU detection code needs to be in a file not compiled with - * "-maltivec -mabi=altivec", as gcc would try to save vector register - * across function calls causing SIGILL on cpus without Altivec/vmx. - */ -static pixman_bool_t initialized = FALSE; -static volatile pixman_bool_t have_vmx = TRUE; - -#ifdef __APPLE__ -#include <sys/sysctl.h> - -static pixman_bool_t -pixman_have_vmx (void) -{ - if (!initialized) - { - size_t length = sizeof(have_vmx); - int error = - sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); - - if (error) - have_vmx = FALSE; - - initialized = TRUE; - } - return have_vmx; -} - -#elif defined (__OpenBSD__) -#include <sys/param.h> -#include <sys/sysctl.h> -#include <machine/cpu.h> - -static pixman_bool_t -pixman_have_vmx (void) -{ - if (!initialized) - { - int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; - size_t length = sizeof(have_vmx); - int error = - sysctl (mib, 2, &have_vmx, &length, NULL, 0); - - if (error != 0) - have_vmx = FALSE; - - initialized = TRUE; - } - return have_vmx; -} - -#elif defined (__linux__) -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <linux/auxvec.h> -#include <asm/cputable.h> - -static pixman_bool_t -pixman_have_vmx (void) -{ - if (!initialized) - { - char fname[64]; - unsigned long buf[64]; - ssize_t count = 0; - pid_t pid; - int fd, i; - - pid = getpid (); - snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid); - - fd = open (fname, O_RDONLY); - if (fd >= 0) - { - for (i = 0; i <= (count / sizeof(unsigned long)); i += 2) - { - /* Read more if buf is empty... */ - if (i == (count / sizeof(unsigned long))) - { - count = read (fd, buf, sizeof(buf)); - if (count <= 0) - break; - i = 0; - } - - if (buf[i] == AT_HWCAP) - { - have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC); - initialized = TRUE; - break; - } - else if (buf[i] == AT_NULL) - { - break; - } - } - close (fd); - } - } - if (!initialized) - { - /* Something went wrong. Assume 'no' rather than playing - fragile tricks with catching SIGILL. 
*/ - have_vmx = FALSE; - initialized = TRUE; - } - - return have_vmx; -} - -#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */ -#include <signal.h> -#include <setjmp.h> - -static jmp_buf jump_env; - -static void -vmx_test (int sig, - siginfo_t *si, - void * unused) -{ - longjmp (jump_env, 1); -} - -static pixman_bool_t -pixman_have_vmx (void) -{ - struct sigaction sa, osa; - int jmp_result; - - if (!initialized) - { - sa.sa_flags = SA_SIGINFO; - sigemptyset (&sa.sa_mask); - sa.sa_sigaction = vmx_test; - sigaction (SIGILL, &sa, &osa); - jmp_result = setjmp (jump_env); - if (jmp_result == 0) - { - asm volatile ( "vor 0, 0, 0" ); - } - sigaction (SIGILL, &osa, NULL); - have_vmx = (jmp_result == 0); - initialized = TRUE; - } - return have_vmx; -} - -#endif /* __APPLE__ */ -#endif /* USE_VMX */ - -#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) - -#if defined(_MSC_VER) - -#if defined(USE_ARM_SIMD) -extern int pixman_msvc_try_arm_simd_op (); - -pixman_bool_t -pixman_have_arm_simd (void) -{ - static pixman_bool_t initialized = FALSE; - static pixman_bool_t have_arm_simd = FALSE; - - if (!initialized) - { - __try { - pixman_msvc_try_arm_simd_op (); - have_arm_simd = TRUE; - } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) { - have_arm_simd = FALSE; - } - initialized = TRUE; - } - - return have_arm_simd; -} - -#endif /* USE_ARM_SIMD */ - -#if defined(USE_ARM_NEON) -extern int pixman_msvc_try_arm_neon_op (); - -pixman_bool_t -pixman_have_arm_neon (void) -{ - static pixman_bool_t initialized = FALSE; - static pixman_bool_t have_arm_neon = FALSE; - - if (!initialized) - { - __try - { - pixman_msvc_try_arm_neon_op (); - have_arm_neon = TRUE; - } - __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) - { - have_arm_neon = FALSE; - } - initialized = TRUE; - } - - return have_arm_neon; -} - -#endif /* USE_ARM_NEON */ - -#else /* linux ELF */ - -#include <stdlib.h> -#include <unistd.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/mman.h> -#include <fcntl.h> -#include <string.h> -#include <elf.h> - -static pixman_bool_t arm_has_v7 = FALSE; -static pixman_bool_t arm_has_v6 = FALSE; -static pixman_bool_t arm_has_vfp = FALSE; -static pixman_bool_t arm_has_neon = FALSE; -static pixman_bool_t arm_has_iwmmxt = FALSE; -static pixman_bool_t arm_tests_initialized = FALSE; - -static void -pixman_arm_read_auxv () -{ - int fd; - Elf32_auxv_t aux; - - fd = open ("/proc/self/auxv", O_RDONLY); - if (fd >= 0) - { - while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) - { - if (aux.a_type == AT_HWCAP) - { - uint32_t hwcap = aux.a_un.a_val; - /* hardcode these values to avoid depending on specific - * versions of the hwcap header, e.g. 
HWCAP_NEON - */ - arm_has_vfp = (hwcap & 64) != 0; - arm_has_iwmmxt = (hwcap & 512) != 0; - /* this flag is only present on kernel 2.6.29 */ - arm_has_neon = (hwcap & 4096) != 0; - } - else if (aux.a_type == AT_PLATFORM) - { - const char *plat = (const char*) aux.a_un.a_val; - if (strncmp (plat, "v7l", 3) == 0) - { - arm_has_v7 = TRUE; - arm_has_v6 = TRUE; - } - else if (strncmp (plat, "v6l", 3) == 0) - { - arm_has_v6 = TRUE; - } - } - } - close (fd); - } - - arm_tests_initialized = TRUE; -} - -#if defined(USE_ARM_SIMD) -pixman_bool_t -pixman_have_arm_simd (void) -{ - if (!arm_tests_initialized) - pixman_arm_read_auxv (); - - return arm_has_v6; -} - -#endif /* USE_ARM_SIMD */ - -#if defined(USE_ARM_NEON) -pixman_bool_t -pixman_have_arm_neon (void) -{ - if (!arm_tests_initialized) - pixman_arm_read_auxv (); - - return arm_has_neon; -} - -#endif /* USE_ARM_NEON */ - -#endif /* linux */ - -#endif /* USE_ARM_SIMD || USE_ARM_NEON */ - -#if defined(USE_MMX) || defined(USE_SSE2) -/* The CPU detection code needs to be in a file not compiled with - * "-mmmx -msse", as gcc would generate CMOV instructions otherwise - * that would lead to SIGILL instructions on old CPUs that don't have - * it. - */ -#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64) - -#ifdef HAVE_GETISAX -#include <sys/auxv.h> -#endif - -typedef enum -{ - NO_FEATURES = 0, - MMX = 0x1, - MMX_EXTENSIONS = 0x2, - SSE = 0x6, - SSE2 = 0x8, - CMOV = 0x10 -} cpu_features_t; - - -static unsigned int -detect_cpu_features (void) -{ - unsigned int features = 0; - unsigned int result = 0; - -#ifdef HAVE_GETISAX - if (getisax (&result, 1)) - { - if (result & AV_386_CMOV) - features |= CMOV; - if (result & AV_386_MMX) - features |= MMX; - if (result & AV_386_AMD_MMX) - features |= MMX_EXTENSIONS; - if (result & AV_386_SSE) - features |= SSE; - if (result & AV_386_SSE2) - features |= SSE2; - } -#else - char vendor[13]; -#ifdef _MSC_VER - int vendor0 = 0, vendor1, vendor2; -#endif - vendor[0] = 0; - vendor[12] = 0; - -#ifdef __GNUC__ - /* see p. 118 of amd64 instruction set manual Vol3 */ - /* We need to be careful about the handling of %ebx and - * %esp here. We can't declare either one as clobbered - * since they are special registers (%ebx is the "PIC - * register" holding an offset to global data, %esp the - * stack pointer), so we need to make sure they have their - * original values when we access the output operands. 
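 * (Editorial aside, not in the original comment: the EDX value saved into
 * 'result' below holds the standard CPUID leaf-1 feature flags, which is
 * why the code tests bit 15 for CMOV, bit 23 for MMX, bit 25 for SSE and
 * bit 26 for SSE2; the AMD extended leaf 0x80000001 reports the MMX
 * extensions in bit 22.)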
- */ - __asm__ ( - "pushf\n" - "pop %%eax\n" - "mov %%eax, %%ecx\n" - "xor $0x00200000, %%eax\n" - "push %%eax\n" - "popf\n" - "pushf\n" - "pop %%eax\n" - "mov $0x0, %%edx\n" - "xor %%ecx, %%eax\n" - "jz 1f\n" - - "mov $0x00000000, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "mov %%ebx, %%eax\n" - "pop %%ebx\n" - "mov %%eax, %1\n" - "mov %%edx, %2\n" - "mov %%ecx, %3\n" - "mov $0x00000001, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "pop %%ebx\n" - "1:\n" - "mov %%edx, %0\n" - : "=r" (result), - "=m" (vendor[0]), - "=m" (vendor[4]), - "=m" (vendor[8]) - : - : "%eax", "%ecx", "%edx" - ); - -#elif defined (_MSC_VER) - - _asm { - pushfd - pop eax - mov ecx, eax - xor eax, 00200000h - push eax - popfd - pushfd - pop eax - mov edx, 0 - xor eax, ecx - jz nocpuid - - mov eax, 0 - push ebx - cpuid - mov eax, ebx - pop ebx - mov vendor0, eax - mov vendor1, edx - mov vendor2, ecx - mov eax, 1 - push ebx - cpuid - pop ebx - nocpuid: - mov result, edx - } - memmove (vendor + 0, &vendor0, 4); - memmove (vendor + 4, &vendor1, 4); - memmove (vendor + 8, &vendor2, 4); - -#else -# error unsupported compiler -#endif - - features = 0; - if (result) - { - /* result now contains the standard feature bits */ - if (result & (1 << 15)) - features |= CMOV; - if (result & (1 << 23)) - features |= MMX; - if (result & (1 << 25)) - features |= SSE; - if (result & (1 << 26)) - features |= SSE2; - if ((features & MMX) && !(features & SSE) && - (strcmp (vendor, "AuthenticAMD") == 0 || - strcmp (vendor, "Geode by NSC") == 0)) - { - /* check for AMD MMX extensions */ -#ifdef __GNUC__ - __asm__ ( - " push %%ebx\n" - " mov $0x80000000, %%eax\n" - " cpuid\n" - " xor %%edx, %%edx\n" - " cmp $0x1, %%eax\n" - " jge 2f\n" - " mov $0x80000001, %%eax\n" - " cpuid\n" - "2:\n" - " pop %%ebx\n" - " mov %%edx, %0\n" - : "=r" (result) - : - : "%eax", "%ecx", "%edx" - ); -#elif defined _MSC_VER - _asm { - push ebx - mov eax, 80000000h - cpuid - xor edx, edx - cmp eax, 1 - jge notamd - mov eax, 80000001h - cpuid - notamd: - pop ebx - mov result, edx - } -#endif - if (result & (1 << 22)) - features |= MMX_EXTENSIONS; - } - } -#endif /* HAVE_GETISAX */ - - return features; -} - -static pixman_bool_t -pixman_have_mmx (void) -{ - static pixman_bool_t initialized = FALSE; - static pixman_bool_t mmx_present; - - if (!initialized) - { - unsigned int features = detect_cpu_features (); - mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS); - initialized = TRUE; - } - - return mmx_present; -} - -#ifdef USE_SSE2 -static pixman_bool_t -pixman_have_sse2 (void) -{ - static pixman_bool_t initialized = FALSE; - static pixman_bool_t sse2_present; - - if (!initialized) - { - unsigned int features = detect_cpu_features (); - sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2); - initialized = TRUE; - } - - return sse2_present; -} - -#endif - -#else /* __amd64__ */ -#ifdef USE_MMX -#define pixman_have_mmx() TRUE -#endif -#ifdef USE_SSE2 -#define pixman_have_sse2() TRUE -#endif -#endif /* __amd64__ */ -#endif - -pixman_implementation_t * -_pixman_choose_implementation (void) -{ -#ifdef USE_SSE2 - if (pixman_have_sse2 ()) - return _pixman_implementation_create_sse2 (); -#endif -#ifdef USE_MMX - if (pixman_have_mmx ()) - return _pixman_implementation_create_mmx (); -#endif - -#ifdef USE_ARM_NEON - if (pixman_have_arm_neon ()) - return _pixman_implementation_create_arm_neon (); -#endif -#ifdef USE_ARM_SIMD - if (pixman_have_arm_simd ()) - return _pixman_implementation_create_arm_simd (); -#endif 
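/* Editorial note, not part of the original source: each
 * _pixman_implementation_create_* () constructor links the new
 * implementation to a slower fallback, so on an SSE2-capable x86 the
 * result is roughly the delegate chain
 *     sse2 -> mmx -> fast_path -> general
 * and any operation the top level lacks a fast path for falls through
 * to the next implementation. */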
-#ifdef USE_VMX - if (pixman_have_vmx ()) - return _pixman_implementation_create_vmx (); -#endif - - return _pixman_implementation_create_fast_path (); -} - diff --git a/programs/develop/libraries/pixman/pixman-edge.c b/programs/develop/libraries/pixman/pixman-edge.c index 8d498ab445..ad6dfc4cfa 100644 --- a/programs/develop/libraries/pixman/pixman-edge.c +++ b/programs/develop/libraries/pixman/pixman-edge.c @@ -374,6 +374,7 @@ pixman_rasterize_edges (pixman_image_t *image, pixman_fixed_t b) { return_if_fail (image->type == BITS); + return_if_fail (PIXMAN_FORMAT_TYPE (image->bits.format) == PIXMAN_TYPE_A); if (image->bits.read_func || image->bits.write_func) pixman_rasterize_edges_accessors (image, l, r, t, b); diff --git a/programs/develop/libraries/pixman/pixman-fast-path.c b/programs/develop/libraries/pixman/pixman-fast-path.c index 5d5fa956c8..247aea6450 100644 --- a/programs/develop/libraries/pixman/pixman-fast-path.c +++ b/programs/develop/libraries/pixman/pixman-fast-path.c @@ -30,12 +30,12 @@ #include <string.h> #include "pixman-private.h" #include "pixman-combine32.h" -#include "pixman-fast-path.h" +#include "pixman-inlines.h" static force_inline uint32_t fetch_24 (uint8_t *a) { - if (((unsigned long)a) & 1) + if (((uintptr_t)a) & 1) { #ifdef WORDS_BIGENDIAN return (*a << 16) | (*(uint16_t *)(a + 1)); @@ -57,7 +57,7 @@ static force_inline void store_24 (uint8_t *a, uint32_t v) { - if (((unsigned long)a) & 1) + if (((uintptr_t)a) & 1) { #ifdef WORDS_BIGENDIAN *a = (uint8_t) (v >> 16); @@ -90,7 +90,7 @@ over (uint32_t src, return dest; } -static uint32_t +static force_inline uint32_t in (uint32_t x, uint8_t y) { @@ -108,19 +108,9 @@ */ static void fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *src, *src_line; uint32_t *dst, *dst_line; uint8_t *mask, *mask_line; @@ -129,7 +119,7 @@ fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, uint32_t s, d; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); @@ -168,19 +158,9 @@ static void fast_composite_in_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dest_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint8_t *dst_line, *dst; uint8_t *mask_line, *mask, m; @@ -188,7 +168,7 @@ fast_composite_in_n_8_8 (pixman_implementation_t *imp, int32_t w; uint16_t t; - src = _pixman_image_get_solid (src_image, dest_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; @@ -246,19 +226,9 @@ static void fast_composite_in_8_8 (pixman_implementation_t *imp, - 
pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dest_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *src_line, *src; int dst_stride, src_stride; @@ -293,32 +263,22 @@ fast_composite_in_8_8 (pixman_implementation_t *imp, static void fast_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst_line, *dst, d; uint8_t *mask_line, *mask, m; int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) @@ -351,32 +311,21 @@ fast_composite_over_n_8_8888 (pixman_implementation_t *imp, static void fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - uint32_t src, srca, s; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, s; uint32_t *dst_line, *dst, d; uint32_t *mask_line, *mask, ma; int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); while (height--) @@ -408,32 +357,22 @@ fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, static void fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca, s; uint32_t *dst_line, *dst, d; uint32_t *mask_line, *mask, ma; int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, 
uint32_t, mask_stride, mask_line, 1); while (height--) @@ -474,19 +413,9 @@ fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, static void fast_composite_over_n_8_0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint8_t *dst_line, *dst; uint32_t d; @@ -494,13 +423,13 @@ fast_composite_over_n_8_0888 (pixman_implementation_t *imp, int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) @@ -539,19 +468,9 @@ fast_composite_over_n_8_0888 (pixman_implementation_t *imp, static void fast_composite_over_n_8_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint16_t *dst_line, *dst; uint32_t d; @@ -559,13 +478,13 @@ fast_composite_over_n_8_0565 (pixman_implementation_t *imp, int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) @@ -588,15 +507,15 @@ fast_composite_over_n_8_0565 (pixman_implementation_t *imp, else { d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); + d = over (src, convert_0565_to_0888 (d)); } - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } else if (m) { d = *dst; - d = over (in (src, m), CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (in (src, m), convert_0565_to_0888 (d)); + *dst = convert_8888_to_0565 (d); } dst++; } @@ -605,19 +524,9 @@ fast_composite_over_n_8_0565 (pixman_implementation_t *imp, static void fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca, s; uint16_t src16; uint16_t *dst_line, *dst; @@ -626,15 +535,15 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, int dst_stride, mask_stride; int32_t w; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, 
dest_image->bits.format); srca = src >> 24; if (src == 0) return; - src16 = CONVERT_8888_TO_0565 (src); + src16 = convert_8888_to_0565 (src); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); while (height--) @@ -657,14 +566,14 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, else { d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (src, convert_0565_to_0888 (d)); + *dst = convert_8888_to_0565 (d); } } else if (ma) { d = *dst; - d = CONVERT_0565_TO_0888 (d); + d = convert_0565_to_0888 (d); s = src; @@ -673,7 +582,7 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, ma = ~ma; UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } dst++; } @@ -682,26 +591,16 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, static void fast_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src, s; int dst_stride, src_stride; uint8_t a; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) @@ -727,25 +626,15 @@ fast_composite_over_8888_8888 (pixman_implementation_t *imp, static void fast_composite_src_x888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) @@ -764,19 +653,9 @@ fast_composite_src_x888_8888 (pixman_implementation_t *imp, #if 0 static void fast_composite_over_8888_0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint32_t d; uint32_t *src_line, *src, s; @@ -784,7 +663,7 @@ fast_composite_over_8888_0888 (pixman_implementation_t *imp, int dst_stride, src_stride; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); 
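/* Editorial note, not part of the original source: PIXMAN_IMAGE_GET_LINE
 * expands to roughly
 *     stride = image->bits.rowstride * sizeof (uint32_t) / sizeof (type);
 *     line   = (type *) image->bits.bits + stride * y + mul * x;
 * i.e. it yields a typed pointer to pixel (x, y) plus the row stride in
 * units of 'type'; the 'mul' argument (3 in the call above) covers 24bpp,
 * where one pixel spans three uint8_t units. */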
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) @@ -816,19 +695,9 @@ fast_composite_over_8888_0888 (pixman_implementation_t *imp, static void fast_composite_over_8888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint16_t *dst_line, *dst; uint32_t d; uint32_t *src_line, *src, s; @@ -837,7 +706,7 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp, int32_t w; PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); while (height--) { @@ -860,70 +729,20 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp, else { d = *dst; - d = over (s, CONVERT_0565_TO_0888 (d)); + d = over (s, convert_0565_to_0888 (d)); } - *dst = CONVERT_8888_TO_0565 (d); + *dst = convert_8888_to_0565 (d); } dst++; } } } -static void -fast_composite_src_x888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint16_t *dst_line, *dst; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - *dst = CONVERT_8888_TO_0565 (s); - dst++; - } - } -} - static void fast_composite_add_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *src_line, *src; int dst_stride, src_stride; @@ -932,7 +751,7 @@ fast_composite_add_8_8 (pixman_implementation_t *imp, uint16_t t; PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { @@ -961,20 +780,52 @@ fast_composite_add_8_8 (pixman_implementation_t *imp, } static void -fast_composite_add_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +fast_composite_add_0565_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t d; + uint16_t *src_line, *src; + uint32_t s; + int 
dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + d = *dst; + s = convert_0565_to_8888 (s); + if (d) + { + d = convert_0565_to_8888 (d); + UN8x4_ADD_UN8x4 (s, d); + } + *dst = convert_8888_to_0565 (s); + } + dst++; + } + } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -982,7 +833,7 @@ fast_composite_add_8888_8888 (pixman_implementation_t *imp, uint32_t s, d; PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { @@ -1012,19 +863,9 @@ fast_composite_add_8888_8888 (pixman_implementation_t *imp, static void fast_composite_add_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; @@ -1032,9 +873,9 @@ fast_composite_add_n_8_8 (pixman_implementation_t *imp, uint32_t src; uint8_t sa; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); sa = (src >> 24); while (height--) @@ -1077,20 +918,10 @@ fast_composite_add_n_8_8 (pixman_implementation_t *imp, do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); static void -fast_composite_add_1000_1000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +fast_composite_add_1_1 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -1098,7 +929,7 @@ fast_composite_add_1000_1000 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t, + PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) @@ -1123,19 +954,9 @@ fast_composite_add_1000_1000 (pixman_implementation_t *imp, static void fast_composite_over_n_1_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t 
mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst, *dst_line; uint32_t *mask, *mask_line; @@ -1146,12 +967,12 @@ fast_composite_over_n_1_8888 (pixman_implementation_t *imp, if (width <= 0) return; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, mask_stride, mask_line, 1); @@ -1215,19 +1036,9 @@ fast_composite_over_n_1_8888 (pixman_implementation_t *imp, static void fast_composite_over_n_1_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint16_t *dst, *dst_line; uint32_t *mask, *mask_line; @@ -1240,12 +1051,12 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp, if (width <= 0) return; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, mask_stride, mask_line, 1); @@ -1253,7 +1064,7 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp, if (srca == 0xff) { - src565 = CONVERT_8888_TO_0565 (src); + src565 = convert_8888_to_0565 (src); while (height--) { dst = dst_line; @@ -1301,8 +1112,8 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp, } if (bitcache & bitmask) { - d = over (src, CONVERT_0565_TO_0888 (*dst)); - *dst = CONVERT_8888_TO_0565 (d); + d = over (src, convert_0565_to_0888 (*dst)); + *dst = convert_8888_to_0565 (d); } bitmask = UPDATE_BITMASK (bitmask); dst++; @@ -1317,35 +1128,29 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp, static void fast_composite_solid_fill (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - if (dst_image->bits.format == PIXMAN_a8) + if (dest_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dest_image->bits.format == PIXMAN_a8) { src = src >> 24; } - else if (dst_image->bits.format == PIXMAN_r5g6b5 || - dst_image->bits.format == PIXMAN_b5g6r5) + else if (dest_image->bits.format == PIXMAN_r5g6b5 || + dest_image->bits.format == PIXMAN_b5g6r5) { - src = CONVERT_8888_TO_0565 (src); + src = convert_8888_to_0565 (src); } - pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP 
(dst_image->bits.format), + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y, width, height, src); @@ -1353,30 +1158,20 @@ fast_composite_solid_fill (pixman_implementation_t *imp, static void fast_composite_src_memcpy (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; + PIXMAN_COMPOSITE_ARGS (info); + int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; uint32_t n_bytes = width * bpp; int dst_stride, src_stride; uint8_t *dst; uint8_t *src; src_stride = src_image->bits.rowstride * 4; - dst_stride = dst_image->bits.rowstride * 4; + dst_stride = dest_image->bits.rowstride * 4; src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; - dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp; while (height--) { @@ -1387,43 +1182,211 @@ fast_composite_src_memcpy (pixman_implementation_t *imp, } } -FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER); -FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE); -FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD); -FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL); -FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER); -FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE); -FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD); -FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL); -FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER); -FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE); -FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD); -FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL); -FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL); -FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER); -FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE); -FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD); -FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL); +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 
8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +#define REPEAT_MIN_WIDTH 32 + +static void +fast_composite_tiled_repeat (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + pixman_composite_func_t func; + pixman_format_code_t mask_format; + uint32_t src_flags, mask_flags; + int32_t sx, sy; + int32_t width_remain; + int32_t num_pixels; + int32_t src_width; + int32_t i, j; + pixman_image_t extended_src_image; + uint32_t extended_src[REPEAT_MIN_WIDTH * 2]; + pixman_bool_t need_src_extension; + uint32_t *src_line; + int32_t src_stride; + int32_t src_bpp; + pixman_composite_info_t info2 = *info; + + src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) | + FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; + + if (mask_image) + { + mask_format = mask_image->common.extended_format_code; + mask_flags = info->mask_flags; + } + else + { + mask_format = PIXMAN_null; + mask_flags = FAST_PATH_IS_OPAQUE; + } + + _pixman_implementation_lookup_composite ( + imp->toplevel, info->op, + src_image->common.extended_format_code, src_flags, + mask_format, mask_flags, + dest_image->common.extended_format_code, info->dest_flags, + &imp, &func); + + src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format); + + if (src_image->bits.width < REPEAT_MIN_WIDTH && + (src_bpp == 32 || src_bpp == 16 || src_bpp == 8) && + !src_image->bits.indexed) + { + sx = src_x; + sx = MOD (sx, src_image->bits.width); + sx += width; + src_width = 0; + + while (src_width < REPEAT_MIN_WIDTH && src_width <= sx) + src_width += src_image->bits.width; + + src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t); + + /* Initialize/validate stack-allocated temporary image */ + _pixman_bits_image_init (&extended_src_image, src_image->bits.format, + src_width, 1, &extended_src[0], src_stride, + FALSE); + _pixman_image_validate (&extended_src_image); + + info2.src_image = &extended_src_image; + need_src_extension = TRUE; + } + else + { + src_width = src_image->bits.width; + need_src_extension = FALSE; + } + + sx = src_x; + sy = src_y; + + while (--height >= 0) + { + sx = MOD (sx, src_width); + sy = MOD (sy, src_image->bits.height); + + if (need_src_extension) + { + if (src_bpp == 32) + { + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1); + + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; j++, i++) + extended_src[i] = src_line[j]; + } + } + else if (src_bpp == 16) + { + uint16_t *src_line_16; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride, + src_line_16, 1); + src_line = (uint32_t*)src_line_16; + + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; j++, i++) + ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j]; + } + } + else if (src_bpp == 8) + { + uint8_t *src_line_8; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride, + src_line_8, 1); + src_line = (uint32_t*)src_line_8; + + for (i = 0; i < src_width; ) + { + for (j = 0; j < src_image->bits.width; j++, i++) + 
((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j]; + } + } + + info2.src_y = 0; + } + else + { + info2.src_y = sy; + } + + width_remain = width; + + while (width_remain > 0) + { + num_pixels = src_width - sx; + + if (num_pixels > width_remain) + num_pixels = width_remain; + + info2.src_x = sx; + info2.width = num_pixels; + info2.height = 1; + + func (imp, &info2); + + width_remain -= num_pixels; + info2.mask_x += num_pixels; + info2.dest_x += num_pixels; + sx = 0; + } + + sx = src_x; + sy++; + info2.mask_x = info->mask_x; + info2.mask_y++; + info2.dest_x = info->dest_x; + info2.dest_y++; + } + + if (need_src_extension) + _pixman_image_fini (&extended_src_image); +} /* Use more unrolling for src_0565_0565 because it is typically CPU bound */ static force_inline void -scaled_nearest_scanline_565_565_SRC (uint16_t * dst, - uint16_t * src, - int32_t w, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - pixman_fixed_t max_vx) +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + const uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) { uint16_t tmp1, tmp2, tmp3, tmp4; while ((w -= 4) >= 0) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; @@ -1432,26 +1395,26 @@ scaled_nearest_scanline_565_565_SRC (uint16_t * dst, } if (w & 2) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; } if (w & 1) - *dst++ = src[pixman_fixed_to_int (vx)]; + *dst = *(src + pixman_fixed_to_int (vx)); } FAST_NEAREST_MAINLOOP (565_565_cover_SRC, scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, COVER); + uint16_t, uint16_t, COVER) FAST_NEAREST_MAINLOOP (565_565_none_SRC, scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, NONE); + uint16_t, uint16_t, NONE) FAST_NEAREST_MAINLOOP (565_565_pad_SRC, scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, PAD); + uint16_t, uint16_t, PAD) static force_inline uint32_t fetch_nearest (pixman_repeat_t src_repeat, @@ -1460,7 +1423,7 @@ fetch_nearest (pixman_repeat_t src_repeat, { if (repeat (src_repeat, &x, src_width)) { - if (format == PIXMAN_x8r8g8b8) + if (format == PIXMAN_x8r8g8b8 || format == PIXMAN_x8b8g8r8) return *(src + x) | 0xff000000; else return *(src + x); @@ -1493,19 +1456,9 @@ combine_src (uint32_t s, uint32_t *dst) static void fast_composite_scaled_nearest (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line; uint32_t *src_line; int dst_stride, src_stride; @@ -1516,7 +1469,7 @@ fast_composite_scaled_nearest (pixman_implementation_t *imp, pixman_vector_t v; pixman_fixed_t vy; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, 
dest_y, uint32_t, dst_stride, dst_line, 1); /* pass in 0 instead of src_x and src_y because src_x and src_y need to be * transformed from destination space to source space */ @@ -1613,6 +1566,252 @@ fast_composite_scaled_nearest (pixman_implementation_t *imp, } } +#define CACHE_LINE_SIZE 64 + +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + \ +static void \ +blt_rotated_90_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_270_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + src_stride * (w - 1) + y; \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_90_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * x, \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src + W * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +blt_rotated_270_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if 
(leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src + src_stride * (W - leading_pixels), \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * (W - x - TILE_SIZE), \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src - trailing_pixels * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = -src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ + src_y_t = src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} \ + \ +static void \ +fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + src_y_t = -src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} + +FAST_SIMPLE_ROTATE (8, uint8_t) +FAST_SIMPLE_ROTATE (565, uint16_t) +FAST_SIMPLE_ROTATE (8888, uint32_t) + static const pixman_fast_path_t c_fast_paths[] = { PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), @@ -1645,16 +1844,19 @@ static const pixman_fast_path_t c_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565), + PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565), PIXMAN_STD_FAST_PATH (ADD, 
a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), - PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), + PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1), PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), @@ -1675,10 +1877,6 @@ static const pixman_fast_path_t c_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), @@ -1695,6 +1893,13 @@ static const pixman_fast_path_t c_fast_paths[] = SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), @@ -1730,9 +1935,121 @@ static const pixman_fast_path_t c_fast_paths[] = NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), +#define SIMPLE_ROTATE_FLAGS(angle) \ + (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | \ + FAST_PATH_STANDARD_FLAGS) + +#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_90_##suffix, \ + }, \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_270_##suffix, \ + } + + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH 
(SRC, x8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), + SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), + + /* Simple repeat fast path entry. */ + { PIXMAN_OP_any, + PIXMAN_any, + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE | + FAST_PATH_NORMAL_REPEAT), + PIXMAN_any, 0, + PIXMAN_any, FAST_PATH_STD_DEST_FLAGS, + fast_composite_tiled_repeat + }, + { PIXMAN_OP_NONE }, }; +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +pixman_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t filler) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + if (filler & 1) + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 1); + dst += stride; + } + } + else + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 0); + dst += stride; + } + } +} + static void pixman_fill8 (uint32_t *bits, int stride, @@ -1740,11 +2057,11 @@ pixman_fill8 (uint32_t *bits, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int byte_stride = stride * (int) sizeof (uint32_t); uint8_t *dst = (uint8_t *) bits; - uint8_t v = xor & 0xff; + uint8_t v = filler & 0xff; int i; dst = dst + y * byte_stride + x; @@ -1765,12 +2082,12 @@ pixman_fill16 (uint32_t *bits, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int short_stride = (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); uint16_t *dst = (uint16_t *)bits; - uint16_t v = xor & 0xffff; + uint16_t v = filler & 0xffff; int i; dst = dst + y * short_stride + x; @@ -1791,7 +2108,7 @@ pixman_fill32 (uint32_t *bits, int y, int width, int height, - uint32_t xor) + uint32_t filler) { int i; @@ -1800,7 +2117,7 @@ pixman_fill32 (uint32_t *bits, while (height--) { for (i = 0; i < width; ++i) - bits[i] = xor; + bits[i] = filler; bits += stride; } @@ -1815,38 +2132,227 @@ fast_path_fill (pixman_implementation_t *imp, int y, int width, int height, - uint32_t xor) + uint32_t filler) { switch (bpp) { + case 1: + pixman_fill1 (bits, stride, x, y, width, height, filler); + break; + case 8: - pixman_fill8 (bits, stride, x, y, width, height, xor); + pixman_fill8 (bits, stride, x, y, width, height, filler); break; case 16: - pixman_fill16 (bits, stride, x, y, width, height, xor); + pixman_fill16 (bits, stride, x, y, width, height, filler); break; case 32: - pixman_fill32 (bits, stride, x, y, width, height, xor); + pixman_fill32 (bits, stride, x, y, width, height, filler); break; default: - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); - break; + return FALSE; } return TRUE; } -pixman_implementation_t * -_pixman_implementation_create_fast_path (void) 
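To make the new a1 fill path above concrete, here is a minimal standalone sketch (not part of the patch) of how the little-endian A1_FILL_MASK variant carves an n-pixel run out of a single 32-bit scanline word, mirroring what pixman_fill1_line does for partial words:

/* Demo of the non-WORDS_BIGENDIAN case: pixel k of a word is bit k. */
#include <stdio.h>
#include <stdint.h>

#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))

int
main (void)
{
    uint32_t word = 0;

    word |= A1_FILL_MASK (5, 3);         /* set pixels 3..7 */
    printf ("%08x\n", (unsigned) word);  /* prints 000000f8 */

    word &= ~A1_FILL_MASK (2, 4);        /* clear pixels 4..5 */
    printf ("%08x\n", (unsigned) word);  /* prints 000000c8 */

    return 0;
}

Note that (1U << (n)) - 1 requires n < 32; pixman_fill1_line never hits n == 32 because whole words go through the plain 0xFFFFFFFF / 0 store loop.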
+/*****************************************************************************/ + +static uint32_t * +fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) { - pixman_implementation_t *general = _pixman_implementation_create_general (); - pixman_implementation_t *imp = _pixman_implementation_create (general, c_fast_paths); + int32_t w = iter->width; + uint32_t *dst = iter->buffer; + const uint16_t *src = (const uint16_t *)iter->bits; + + iter->bits += iter->stride; + + /* Align the source buffer at 4 bytes boundary */ + if (w > 0 && ((uintptr_t)src & 3)) + { + *dst++ = convert_0565_to_8888 (*src++); + w--; + } + /* Process two pixels per iteration */ + while ((w -= 2) >= 0) + { + uint32_t sr, sb, sg, t0, t1; + uint32_t s = *(const uint32_t *)src; + src += 2; + sr = (s >> 8) & 0x00F800F8; + sb = (s << 3) & 0x00F800F8; + sg = (s >> 3) & 0x00FC00FC; + sr |= sr >> 5; + sb |= sb >> 5; + sg |= sg >> 6; + t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) | + (sb & 0xFF) | 0xFF000000; + t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) | + (sb >> 16) | 0xFF000000; +#ifdef WORDS_BIGENDIAN + *dst++ = t1; + *dst++ = t0; +#else + *dst++ = t0; + *dst++ = t1; +#endif + } + if (w & 1) + { + *dst = convert_0565_to_8888 (*src); + } + + return iter->buffer; +} + +static uint32_t * +fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) +{ + iter->bits += iter->stride; + return iter->buffer; +} + +/* Helper function for a workaround, which tries to ensure that 0x1F001F + * constant is always allocated in a register on RISC architectures. + */ +static force_inline uint32_t +convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F) +{ + uint32_t a, b; + a = (s >> 3) & x1F001F; + b = s & 0xFC00; + a |= a >> 5; + a |= b >> 5; + return a; +} + +static void +fast_write_back_r5g6b5 (pixman_iter_t *iter) +{ + int32_t w = iter->width; + uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); + const uint32_t *src = iter->buffer; + /* Workaround to ensure that x1F001F variable is allocated in a register */ + static volatile uint32_t volatile_x1F001F = 0x1F001F; + uint32_t x1F001F = volatile_x1F001F; + + while ((w -= 4) >= 0) + { + uint32_t s1 = *src++; + uint32_t s2 = *src++; + uint32_t s3 = *src++; + uint32_t s4 = *src++; + *dst++ = convert_8888_to_0565_workaround (s1, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s2, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s3, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s4, x1F001F); + } + if (w & 2) + { + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + } + if (w & 1) + { + *dst = convert_8888_to_0565_workaround (*src, x1F001F); + } +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; + pixman_iter_write_back_t write_back; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 }, + { PIXMAN_null } +}; + +static pixman_bool_t +fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FLAGS) == FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t 
*)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return TRUE; + } + } + } + + return FALSE; +} + +static pixman_bool_t +fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == + (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) + { + iter->get_scanline = fast_dest_fetch_noop; + } + else + { + iter->get_scanline = f->get_scanline; + } + iter->write_back = f->write_back; + return TRUE; + } + } + } + return FALSE; +} + + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); imp->fill = fast_path_fill; + imp->src_iter_init = fast_src_iter_init; + imp->dest_iter_init = fast_dest_iter_init; return imp; } diff --git a/programs/develop/libraries/pixman/pixman-fast-path.h b/programs/develop/libraries/pixman/pixman-fast-path.h deleted file mode 100644 index 98ef81ea7d..0000000000 --- a/programs/develop/libraries/pixman/pixman-fast-path.h +++ /dev/null @@ -1,451 +0,0 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Author: Keith Packard, SuSE, Inc. 
- */ - -#ifndef PIXMAN_FAST_PATH_H__ -#define PIXMAN_FAST_PATH_H__ - -#include "pixman-private.h" - -#define PIXMAN_REPEAT_COVER -1 - -static force_inline pixman_bool_t -repeat (pixman_repeat_t repeat, int *c, int size) -{ - if (repeat == PIXMAN_REPEAT_NONE) - { - if (*c < 0 || *c >= size) - return FALSE; - } - else if (repeat == PIXMAN_REPEAT_NORMAL) - { - while (*c >= size) - *c -= size; - while (*c < 0) - *c += size; - } - else if (repeat == PIXMAN_REPEAT_PAD) - { - *c = CLIP (*c, 0, size - 1); - } - else /* REFLECT */ - { - *c = MOD (*c, size * 2); - if (*c >= size) - *c = size * 2 - *c - 1; - } - return TRUE; -} - -/* - * For each scanline fetched from source image with PAD repeat: - * - calculate how many pixels need to be padded on the left side - * - calculate how many pixels need to be padded on the right side - * - update width to only count pixels which are fetched from the image - * All this information is returned via 'width', 'left_pad', 'right_pad' - * arguments. The code is assuming that 'unit_x' is positive. - * - * Note: 64-bit math is used in order to avoid potential overflows, which - * is probably excessive in many cases. This particular function - * may need its own correctness test and performance tuning. - */ -static force_inline void -pad_repeat_get_scanline_bounds (int32_t source_image_width, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - int32_t * width, - int32_t * left_pad, - int32_t * right_pad) -{ - int64_t max_vx = (int64_t) source_image_width << 16; - int64_t tmp; - if (vx < 0) - { - tmp = ((int64_t) unit_x - 1 - vx) / unit_x; - if (tmp > *width) - { - *left_pad = *width; - *width = 0; - } - else - { - *left_pad = (int32_t) tmp; - *width -= (int32_t) tmp; - } - } - else - { - *left_pad = 0; - } - tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; - if (tmp < 0) - { - *right_pad = *width; - *width = 0; - } - else if (tmp >= *width) - { - *right_pad = 0; - } - else - { - *right_pad = *width - (int32_t) tmp; - *width = (int32_t) tmp; - } -} - -/* A macroified version of specialized nearest scalers for some - * common 8888 and 565 formats. It supports SRC and OVER ops. - * - * There are two repeat versions, one that handles repeat normal, - * and one without repeat handling that only works if the src region - * used is completely covered by the pre-repeated source samples. - * - * The loops are unrolled to process two pixels per iteration for better - * performance on most CPU architectures (superscalar processors - * can issue several operations simultaneously, other processors can hide - * instructions latencies by pipelining operations). Unrolling more - * does not make much sense because the compiler will start running out - * of spare registers soon. - */ - -#define GET_8888_ALPHA(s) ((s) >> 24) - /* This is not actually used since we don't have an OVER with - 565 source, but it is needed to build. 
*/ -#define GET_0565_ALPHA(s) 0xff - -#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ -static force_inline void \ -scanline_func_name (dst_type_t *dst, \ - src_type_t *src, \ - int32_t w, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ -{ \ - uint32_t d; \ - src_type_t s1, s2; \ - uint8_t a1, a2; \ - int x1, x2; \ - \ - if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ - abort(); \ - \ - while ((w -= 2) >= 0) \ - { \ - x1 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s1 = src[x1]; \ - \ - x2 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s2 = src[x2]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - \ - if (a2 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - else if (s2) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ - a2 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - } \ - \ - if (w & 1) \ - { \ - x1 = vx >> 16; \ - s1 = src[x1]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - } \ -} - -#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ -static void \ -fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dst_x, \ - int32_t dst_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type_t *dst_line; \ - src_type_t *src_first_line; \ - int y; \ - pixman_fixed_t max_vx = max_vx; /* suppress uninitialized variable warning */ \ - pixman_fixed_t max_vy; \ - pixman_vector_t v; \ - pixman_fixed_t vx, vy; \ - pixman_fixed_t unit_x, unit_y; \ - int32_t left_pad, right_pad; \ - \ - src_type_t *src; \ - dst_type_t *dst; \ - int src_stride, dst_stride; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, 
dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ - /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ - * transformed from destination space to source space */ \ - PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ - \ - /* reference point is the center of the pixel */ \ - v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ - v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ - v.vector[2] = pixman_fixed_1; \ - \ - if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ - return; \ - \ - unit_x = src_image->common.transform->matrix[0][0]; \ - unit_y = src_image->common.transform->matrix[1][1]; \ - \ - /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ - v.vector[0] -= pixman_fixed_e; \ - v.vector[1] -= pixman_fixed_e; \ - \ - vx = v.vector[0]; \ - vy = v.vector[1]; \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* Clamp repeating positions inside the actual samples */ \ - max_vx = src_image->bits.width << 16; \ - max_vy = src_image->bits.height << 16; \ - \ - repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - } \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ - PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ - &width, &left_pad, &right_pad); \ - vx += left_pad * unit_x; \ - } \ - \ - while (--height >= 0) \ - { \ - dst = dst_line; \ - dst_line += dst_stride; \ - \ - y = vy >> 16; \ - vy += unit_y; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ - { \ - repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ - src = src_first_line + src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, src, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \ - right_pad, 0, 0, 0); \ - } \ - } \ - else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - static src_type_t zero = 0; \ - if (y < 0 || y >= src_image->bits.height) \ - { \ - scanline_func (dst, &zero, left_pad + width + right_pad, 0, 0, 0); \ - continue; \ - } \ - src = src_first_line + src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, &zero, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, &zero, right_pad, 0, 0, 0); \ - } \ - } \ - else \ - { \ - src = src_first_line + src_stride * y; \ - scanline_func (dst, src, width, vx, unit_x, max_vx); \ - } \ - } \ -} - -/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ -#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - -#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ - FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ - OP, repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## 
scale_func_name ## _ ## OP, \ - scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - src_type_t, dst_type_t, repeat_mode) \ - \ - extern int no_such_variable - - -#define SCALED_NEAREST_FLAGS \ - (FAST_PATH_SCALE_TRANSFORM | \ - FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NEAREST_FILTER | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_NARROW_FORMAT) - -#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NORMAL_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_PAD_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NONE_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ - } - -/* Prefer the use of 'cover' variant, because it is faster */ -#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ - SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) - -#endif diff --git a/programs/develop/libraries/pixman/pixman-filter.c b/programs/develop/libraries/pixman/pixman-filter.c new file mode 100644 index 0000000000..5ff7b6eaad --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-filter.c @@ -0,0 +1,350 @@ +/* + * Copyright 2012, Red Hat, Inc. + * Copyright 2012, Soren Sandmann + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Author: Soren Sandmann + */ +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <assert.h> +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +typedef double (* kernel_func_t) (double x); + +typedef struct +{ + pixman_kernel_t kernel; + kernel_func_t func; + double width; +} filter_info_t; + +static double +impulse_kernel (double x) +{ + return (x == 0.0)? 1.0 : 0.0; +} + +static double +box_kernel (double x) +{ + return 1; +} + +static double +linear_kernel (double x) +{ + return 1 - fabs (x); +} + +static double +gaussian_kernel (double x) +{ +#define SQRT2 (1.4142135623730950488016887242096980785696718753769480) +#define SIGMA (SQRT2 / 2.0) + + return exp (- x * x / (2 * SIGMA * SIGMA)) / (SIGMA * sqrt (2.0 * M_PI)); +} + +static double +sinc (double x) +{ + if (x == 0.0) + return 1.0; + else + return sin (M_PI * x) / (M_PI * x); +} + +static double +lanczos (double x, int n) +{ + return sinc (x) * sinc (x * (1.0 / n)); +} + +static double +lanczos2_kernel (double x) +{ + return lanczos (x, 2); +} + +static double +lanczos3_kernel (double x) +{ + return lanczos (x, 3); +} + +static double +nice_kernel (double x) +{ + return lanczos3_kernel (x * 0.75); +} + +static double +general_cubic (double x, double B, double C) +{ + double ax = fabs(x); + + if (ax < 1) + { + return ((12 - 9 * B - 6 * C) * ax * ax * ax + + (-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6; + } + else if (ax >= 1 && ax < 2) + { + return ((-B - 6 * C) * ax * ax * ax + + (6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) * + ax + (8 * B + 24 * C)) / 6; + } + else + { + return 0; + } +} + +static double +cubic_kernel (double x) +{ + /* This is the Mitchell-Netravali filter. + * + * (0.0, 0.5) would give us the Catmull-Rom spline, + * but that one seems to be indistinguishable from Lanczos2. + */ + return general_cubic (x, 1/3.0, 1/3.0); +} + +static const filter_info_t filters[] = +{ + { PIXMAN_KERNEL_IMPULSE, impulse_kernel, 0.0 }, + { PIXMAN_KERNEL_BOX, box_kernel, 1.0 }, + { PIXMAN_KERNEL_LINEAR, linear_kernel, 2.0 }, + { PIXMAN_KERNEL_CUBIC, cubic_kernel, 4.0 }, + { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 6 * SIGMA }, + { PIXMAN_KERNEL_LANCZOS2, lanczos2_kernel, 4.0 }, + { PIXMAN_KERNEL_LANCZOS3, lanczos3_kernel, 6.0 }, + { PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel, 8.0 }, +}; + +/* This function scales @kernel2 by @scale, then + * aligns @x1 in @kernel1 with @x2 in @kernel2 and + * integrates the product of the kernels across @width. + * + * This function assumes that the intervals are within + * the kernels in question. E.g., the caller must not + * try to integrate a linear kernel outside of [-1:1]. + */ +static double +integral (pixman_kernel_t kernel1, double x1, + pixman_kernel_t kernel2, double scale, double x2, + double width) +{ + /* If the integration interval crosses zero, break it into + * two separate integrals. This ensures that filters such + * as LINEAR that are not differentiable at 0 will still + * integrate properly. 
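+ * For example, an integral of the LINEAR kernel over [-0.25, 0.75) is + * evaluated as the sum of integrals over [-0.25, 0) and [0, 0.75), so + * each sub-interval is smooth.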
+ */ + if (x1 < 0 && x1 + width > 0) + { + return + integral (kernel1, x1, kernel2, scale, x2, - x1) + + integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1); + } + else if (x2 < 0 && x2 + width > 0) + { + return + integral (kernel1, x1, kernel2, scale, x2, - x2) + + integral (kernel1, x1 - x2, kernel2, scale, 0, width + x2); + } + else if (kernel1 == PIXMAN_KERNEL_IMPULSE) + { + assert (width == 0.0); + return filters[kernel2].func (x2 * scale); + } + else if (kernel2 == PIXMAN_KERNEL_IMPULSE) + { + assert (width == 0.0); + return filters[kernel1].func (x1); + } + else + { + /* Integration via Simpson's rule */ +#define N_SEGMENTS 128 +#define SAMPLE(a1, a2) \ + (filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale)) + + double s = 0.0; + double h = width / (double)N_SEGMENTS; + int i; + + s = SAMPLE (x1, x2); + + for (i = 1; i < N_SEGMENTS; i += 2) + { + double a1 = x1 + h * i; + double a2 = x2 + h * i; + + s += 2 * SAMPLE (a1, a2); + + if (i >= 2 && i < N_SEGMENTS - 1) + s += 4 * SAMPLE (a1, a2); + } + + s += SAMPLE (x1 + width, x2 + width); + + return h * s * (1.0 / 3.0); + } +} + +static pixman_fixed_t * +create_1d_filter (int *width, + pixman_kernel_t reconstruct, + pixman_kernel_t sample, + double scale, + int n_phases) +{ + pixman_fixed_t *params, *p; + double step; + double size; + int i; + + size = scale * filters[sample].width + filters[reconstruct].width; + *width = ceil (size); + + p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t)); + if (!params) + return NULL; + + step = 1.0 / n_phases; + + for (i = 0; i < n_phases; ++i) + { + double frac = step / 2.0 + i * step; + pixman_fixed_t new_total; + int x, x1, x2; + double total; + + /* Sample convolution of reconstruction and sampling + * filter. See rounding.txt regarding the rounding + * and sample positions. 
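+ * + * Each pass of the surrounding loop fills the taps for one subpixel + * phase: with step = 1 / n_phases, frac = step / 2 + i * step puts the + * sample position at the center of the i'th of n_phases equal slots.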
+ */ + + x1 = ceil (frac - *width / 2.0 - 0.5); + x2 = x1 + *width; + + total = 0; + for (x = x1; x < x2; ++x) + { + double pos = x + 0.5 - frac; + double rlow = - filters[reconstruct].width / 2.0; + double rhigh = rlow + filters[reconstruct].width; + double slow = pos - scale * filters[sample].width / 2.0; + double shigh = slow + scale * filters[sample].width; + double c = 0.0; + double ilow, ihigh; + + if (rhigh >= slow && rlow <= shigh) + { + ilow = MAX (slow, rlow); + ihigh = MIN (shigh, rhigh); + + c = integral (reconstruct, ilow, + sample, 1.0 / scale, ilow - pos, + ihigh - ilow); + } + + total += c; + *p++ = (pixman_fixed_t)(c * 65535.0 + 0.5); + } + + /* Normalize */ + p -= *width; + total = 1 / total; + new_total = 0; + for (x = x1; x < x2; ++x) + { + pixman_fixed_t t = (*p) * total + 0.5; + + new_total += t; + *p++ = t; + } + + if (new_total != pixman_fixed_1) + *(p - *width / 2) += (pixman_fixed_1 - new_total); + } + + return params; +} + +/* Create the parameter list for a SEPARABLE_CONVOLUTION filter + * with the given kernels and scale parameters + */ +PIXMAN_EXPORT pixman_fixed_t * +pixman_filter_create_separable_convolution (int *n_values, + pixman_fixed_t scale_x, + pixman_fixed_t scale_y, + pixman_kernel_t reconstruct_x, + pixman_kernel_t reconstruct_y, + pixman_kernel_t sample_x, + pixman_kernel_t sample_y, + int subsample_bits_x, + int subsample_bits_y) +{ + double sx = fabs (pixman_fixed_to_double (scale_x)); + double sy = fabs (pixman_fixed_to_double (scale_y)); + pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL; + int subsample_x, subsample_y; + int width, height; + + subsample_x = (1 << subsample_bits_x); + subsample_y = (1 << subsample_bits_y); + + horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x); + vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, subsample_y); + + if (!horz || !vert) + goto out; + + *n_values = 4 + width * subsample_x + height * subsample_y; + + params = malloc (*n_values * sizeof (pixman_fixed_t)); + if (!params) + goto out; + + params[0] = pixman_int_to_fixed (width); + params[1] = pixman_int_to_fixed (height); + params[2] = pixman_int_to_fixed (subsample_bits_x); + params[3] = pixman_int_to_fixed (subsample_bits_y); + + memcpy (params + 4, horz, + width * subsample_x * sizeof (pixman_fixed_t)); + memcpy (params + 4 + width * subsample_x, vert, + height * subsample_y * sizeof (pixman_fixed_t)); + +out: + free (horz); + free (vert); + + return params; +} diff --git a/programs/develop/libraries/pixman/pixman-general.c b/programs/develop/libraries/pixman/pixman-general.c index 8130f166ef..93a1b9acfa 100644 --- a/programs/develop/libraries/pixman/pixman-general.c +++ b/programs/develop/libraries/pixman/pixman-general.c @@ -36,44 +36,102 @@ #include #include #include "pixman-private.h" -#include "pixman-combine32.h" -#include "pixman-private.h" + +static pixman_bool_t +general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + + if (image->type == LINEAR) + _pixman_linear_gradient_iter_init (image, iter); + else if (image->type == RADIAL) + _pixman_radial_gradient_iter_init (image, iter); + else if (image->type == CONICAL) + _pixman_conical_gradient_iter_init (image, iter); + else if (image->type == BITS) + _pixman_bits_image_src_iter_init (image, iter); + else if (image->type == SOLID) + _pixman_log_error (FUNC, "Solid image not handled by noop"); + else + _pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); + + return TRUE; +} + 
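Stepping back to the new pixman_filter_create_separable_convolution entry point introduced above, a caller-side usage sketch may help. This is a hedged illustration, not code from the patch; it assumes the public pixman.h API, where pixman_image_set_filter copies the parameter array so the caller may free it afterwards:

#include <stdlib.h>
#include "pixman.h"

static pixman_bool_t
set_downscale_filter (pixman_image_t *src)
{
    int n_values;
    /* 16.16 fixed-point scale factors: 2.0 means one destination pixel
     * covers two source pixels along that axis */
    pixman_fixed_t *params = pixman_filter_create_separable_convolution (
        &n_values,
        pixman_double_to_fixed (2.0), pixman_double_to_fixed (2.0),
        PIXMAN_KERNEL_LINEAR,    /* reconstruct_x */
        PIXMAN_KERNEL_LINEAR,    /* reconstruct_y */
        PIXMAN_KERNEL_LANCZOS2,  /* sample_x */
        PIXMAN_KERNEL_LANCZOS2,  /* sample_y */
        4, 4);                   /* 2^4 = 16 subsample phases per axis */

    if (!params)
        return FALSE;

    pixman_image_set_filter (src, PIXMAN_FILTER_SEPARABLE_CONVOLUTION,
                             params, n_values);
    free (params);

    return TRUE;
}

The parameter layout matches what the function stores: params[0..3] hold the fixed-point width, height and the two subsample_bits values, followed by width * 2^subsample_bits_x horizontal taps and then the vertical ones.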
+static pixman_bool_t +general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + if (iter->image->type == BITS) + { + _pixman_bits_image_dest_iter_init (iter->image, iter); + + return TRUE; + } + else + { + _pixman_log_error (FUNC, "Trying to write to a non-writable image"); + + return FALSE; + } +} + +typedef struct op_info_t op_info_t; +struct op_info_t +{ + uint8_t src, dst; +}; + +#define ITER_IGNORE_BOTH \ + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA) + +static const op_info_t op_flags[PIXMAN_N_OPERATORS] = +{ + /* Src Dst */ + { ITER_IGNORE_BOTH, ITER_IGNORE_BOTH }, /* CLEAR */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_BOTH }, /* SRC */ + { ITER_IGNORE_BOTH, ITER_LOCALIZED_ALPHA }, /* DST */ + { 0, ITER_LOCALIZED_ALPHA }, /* OVER */ + { ITER_LOCALIZED_ALPHA, 0 }, /* OVER_REVERSE */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* IN */ + { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* IN_REVERSE */ + { ITER_LOCALIZED_ALPHA, ITER_IGNORE_RGB }, /* OUT */ + { ITER_IGNORE_RGB, ITER_LOCALIZED_ALPHA }, /* OUT_REVERSE */ + { 0, 0 }, /* ATOP */ + { 0, 0 }, /* ATOP_REVERSE */ + { 0, 0 }, /* XOR */ + { ITER_LOCALIZED_ALPHA, ITER_LOCALIZED_ALPHA }, /* ADD */ + { 0, 0 }, /* SATURATE */ +}; #define SCANLINE_BUFFER_LENGTH 8192 static void general_composite_rect (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8]; uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; uint8_t *src_buffer, *mask_buffer, *dest_buffer; - fetch_scanline_t fetch_src = NULL, fetch_mask = NULL, fetch_dest = NULL; + pixman_iter_t src_iter, mask_iter, dest_iter; pixman_combine_32_func_t compose; - store_scanline_t store; - source_image_class_t src_class, mask_class; pixman_bool_t component_alpha; - uint32_t *bits; - int32_t stride; - int narrow, Bpp; + iter_flags_t narrow, src_iter_flags; + int Bpp; int i; - narrow = - (src->common.flags & FAST_PATH_NARROW_FORMAT) && - (!mask || mask->common.flags & FAST_PATH_NARROW_FORMAT) && - (dest->common.flags & FAST_PATH_NARROW_FORMAT); - Bpp = narrow ? 
4 : 8; + if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT) && + (!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) && + (dest_image->common.flags & FAST_PATH_NARROW_FORMAT)) + { + narrow = ITER_NARROW; + Bpp = 4; + } + else + { + narrow = 0; + Bpp = 16; + } if (width * Bpp > SCANLINE_BUFFER_LENGTH) { @@ -87,172 +145,60 @@ general_composite_rect (pixman_implementation_t *imp, mask_buffer = src_buffer + width * Bpp; dest_buffer = mask_buffer + width * Bpp; - src_class = _pixman_image_classify (src, - src_x, src_y, - width, height); - - mask_class = SOURCE_IMAGE_CLASS_UNKNOWN; - - if (mask) + if (!narrow) { - mask_class = _pixman_image_classify (mask, - src_x, src_y, - width, height); + /* To make sure there aren't any NANs in the buffers */ + memset (src_buffer, 0, width * Bpp); + memset (mask_buffer, 0, width * Bpp); + memset (dest_buffer, 0, width * Bpp); } + + /* src iter */ + src_iter_flags = narrow | op_flags[op].src; - if (op == PIXMAN_OP_CLEAR) - fetch_src = NULL; - else if (narrow) - fetch_src = _pixman_image_get_scanline_32; - else - fetch_src = _pixman_image_get_scanline_64; + _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image, + src_x, src_y, width, height, + src_buffer, src_iter_flags, info->src_flags); - if (!mask || op == PIXMAN_OP_CLEAR) - fetch_mask = NULL; - else if (narrow) - fetch_mask = _pixman_image_get_scanline_32; - else - fetch_mask = _pixman_image_get_scanline_64; - - if (op == PIXMAN_OP_CLEAR || op == PIXMAN_OP_SRC) - fetch_dest = NULL; - else if (narrow) - fetch_dest = _pixman_image_get_scanline_32; - else - fetch_dest = _pixman_image_get_scanline_64; - - if (narrow) - store = _pixman_image_store_scanline_32; - else - store = _pixman_image_store_scanline_64; - - /* Skip the store step and composite directly into the - * destination if the output format of the compose func matches - * the destination format. - * - * If the destination format is a8r8g8b8 then we can always do - * this. If it is x8r8g8b8, then we can only do it if the - * operator doesn't make use of destination alpha. 
- */ - if ((dest->bits.format == PIXMAN_a8r8g8b8) || - (dest->bits.format == PIXMAN_x8r8g8b8 && - (op == PIXMAN_OP_OVER || - op == PIXMAN_OP_ADD || - op == PIXMAN_OP_SRC || - op == PIXMAN_OP_CLEAR || - op == PIXMAN_OP_IN_REVERSE || - op == PIXMAN_OP_OUT_REVERSE || - op == PIXMAN_OP_DST))) + /* mask iter */ + if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) { - if (narrow && - !dest->common.alpha_map && - !dest->bits.write_func) - { - store = NULL; - } - } - - if (!store) - { - bits = dest->bits.bits; - stride = dest->bits.rowstride; - } - else - { - bits = NULL; - stride = 0; + /* If it doesn't matter what the source is, then it doesn't matter + * what the mask is + */ + mask_image = NULL; } component_alpha = - fetch_src && - fetch_mask && - mask && - mask->common.type == BITS && - mask->common.component_alpha && - PIXMAN_FORMAT_RGB (mask->bits.format); + mask_image && + mask_image->common.type == BITS && + mask_image->common.component_alpha && + PIXMAN_FORMAT_RGB (mask_image->bits.format); - if (narrow) - { - if (component_alpha) - compose = _pixman_implementation_combine_32_ca; - else - compose = _pixman_implementation_combine_32; - } - else - { - if (component_alpha) - compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca; - else - compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64; - } + _pixman_implementation_src_iter_init ( + imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height, + mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB), info->mask_flags); - if (!compose) - return; + /* dest iter */ + _pixman_implementation_dest_iter_init ( + imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height, + dest_buffer, narrow | op_flags[op].dst, info->dest_flags); - if (!fetch_mask) - mask_buffer = NULL; + compose = _pixman_implementation_lookup_combiner ( + imp->toplevel, op, component_alpha, narrow); for (i = 0; i < height; ++i) { - /* fill first half of scanline with source */ - if (fetch_src) - { - if (fetch_mask) - { - /* fetch mask before source so that fetching of - source can be optimized */ - fetch_mask (mask, mask_x, mask_y + i, - width, (void *)mask_buffer, 0); + uint32_t *s, *m, *d; - if (mask_class == SOURCE_IMAGE_CLASS_HORIZONTAL) - fetch_mask = NULL; - } + m = mask_iter.get_scanline (&mask_iter, NULL); + s = src_iter.get_scanline (&src_iter, m); + d = dest_iter.get_scanline (&dest_iter, NULL); - if (src_class == SOURCE_IMAGE_CLASS_HORIZONTAL) - { - fetch_src (src, src_x, src_y + i, - width, (void *)src_buffer, 0); - fetch_src = NULL; - } - else - { - fetch_src (src, src_x, src_y + i, - width, (void *)src_buffer, (void *)mask_buffer); - } - } - else if (fetch_mask) - { - fetch_mask (mask, mask_x, mask_y + i, - width, (void *)mask_buffer, 0); - } + compose (imp->toplevel, op, d, s, m, width); - if (store) - { - /* fill dest into second half of scanline */ - if (fetch_dest) - { - fetch_dest (dest, dest_x, dest_y + i, - width, (void *)dest_buffer, 0); - } - - /* blend */ - compose (imp->toplevel, op, - (void *)dest_buffer, - (void *)src_buffer, - (void *)mask_buffer, - width); - - /* write back */ - store (&(dest->bits), dest_x, dest_y + i, width, - (void *)dest_buffer); - } - else - { - /* blend */ - compose (imp->toplevel, op, - bits + (dest_y + i) * stride + dest_x, - (void *)src_buffer, (void *)mask_buffer, width); - } + dest_iter.write_back (&dest_iter); } if (scanline_buffer != (uint8_t *) stack_scanline_buffer) @@ -265,50 +211,16 @@ static const 
pixman_fast_path_t general_fast_path[] =
 {
     { PIXMAN_OP_NONE }
 };
 
-static pixman_bool_t
-general_blt (pixman_implementation_t *imp,
-             uint32_t *               src_bits,
-             uint32_t *               dst_bits,
-             int                      src_stride,
-             int                      dst_stride,
-             int                      src_bpp,
-             int                      dst_bpp,
-             int                      src_x,
-             int                      src_y,
-             int                      dst_x,
-             int                      dst_y,
-             int                      width,
-             int                      height)
-{
-    /* We can't blit unless we have sse2 or mmx */
-
-    return FALSE;
-}
-
-static pixman_bool_t
-general_fill (pixman_implementation_t *imp,
-              uint32_t *               bits,
-              int                      stride,
-              int                      bpp,
-              int                      x,
-              int                      y,
-              int                      width,
-              int                      height,
-              uint32_t                 xor)
-{
-    return FALSE;
-}
-
 pixman_implementation_t *
 _pixman_implementation_create_general (void)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
 
     _pixman_setup_combiner_functions_32 (imp);
-    _pixman_setup_combiner_functions_64 (imp);
+    _pixman_setup_combiner_functions_float (imp);
 
-    imp->blt = general_blt;
-    imp->fill = general_fill;
+    imp->src_iter_init = general_src_iter_init;
+    imp->dest_iter_init = general_dest_iter_init;
 
     return imp;
 }
diff --git a/programs/develop/libraries/pixman/pixman-glyph.c b/programs/develop/libraries/pixman/pixman-glyph.c
new file mode 100644
index 0000000000..5a271b64b8
--- /dev/null
+++ b/programs/develop/libraries/pixman/pixman-glyph.c
@@ -0,0 +1,670 @@
+/*
+ * Copyright 2010, 2012, Soren Sandmann
+ * Copyright 2010, 2011, 2012, Red Hat, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef struct glyph_metrics_t glyph_metrics_t;
+typedef struct glyph_t glyph_t;
+
+#define TOMBSTONE ((glyph_t *)0x1)
+
+/* XXX: These numbers are arbitrary---we've never done any measurements.
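+ *
+ * (Editor's note: HASH_SIZE below is twice N_GLYPHS_HIGH_WATER, so once a
+ * thaw has purged the cache back below the high-water mark the
+ * open-addressed table is no more than about half full, which keeps probe
+ * chains short.)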
+ */ +#define N_GLYPHS_HIGH_WATER (16384) +#define N_GLYPHS_LOW_WATER (8192) +#define HASH_SIZE (2 * N_GLYPHS_HIGH_WATER) +#define HASH_MASK (HASH_SIZE - 1) + +struct glyph_t +{ + void * font_key; + void * glyph_key; + int origin_x; + int origin_y; + pixman_image_t * image; + pixman_link_t mru_link; +}; + +struct pixman_glyph_cache_t +{ + int n_glyphs; + int n_tombstones; + int freeze_count; + pixman_list_t mru; + glyph_t * glyphs[HASH_SIZE]; +}; + +static void +free_glyph (glyph_t *glyph) +{ + pixman_list_unlink (&glyph->mru_link); + pixman_image_unref (glyph->image); + free (glyph); +} + +static unsigned int +hash (const void *font_key, const void *glyph_key) +{ + size_t key = (size_t)font_key + (size_t)glyph_key; + + /* This hash function is based on one found on Thomas Wang's + * web page at + * + * http://www.concentric.net/~Ttwang/tech/inthash.htm + * + */ + key = (key << 15) - key - 1; + key = key ^ (key >> 12); + key = key + (key << 2); + key = key ^ (key >> 4); + key = key + (key << 3) + (key << 11); + key = key ^ (key >> 16); + + return key; +} + +static glyph_t * +lookup_glyph (pixman_glyph_cache_t *cache, + void *font_key, + void *glyph_key) +{ + unsigned idx; + glyph_t *g; + + idx = hash (font_key, glyph_key); + while ((g = cache->glyphs[idx++ & HASH_MASK])) + { + if (g != TOMBSTONE && + g->font_key == font_key && + g->glyph_key == glyph_key) + { + return g; + } + } + + return NULL; +} + +static void +insert_glyph (pixman_glyph_cache_t *cache, + glyph_t *glyph) +{ + unsigned idx; + glyph_t **loc; + + idx = hash (glyph->font_key, glyph->glyph_key); + + /* Note: we assume that there is room in the table. If there isn't, + * this will be an infinite loop. + */ + do + { + loc = &cache->glyphs[idx++ & HASH_MASK]; + } while (*loc && *loc != TOMBSTONE); + + if (*loc == TOMBSTONE) + cache->n_tombstones--; + cache->n_glyphs++; + + *loc = glyph; +} + +static void +remove_glyph (pixman_glyph_cache_t *cache, + glyph_t *glyph) +{ + unsigned idx; + + idx = hash (glyph->font_key, glyph->glyph_key); + while (cache->glyphs[idx & HASH_MASK] != glyph) + idx++; + + cache->glyphs[idx & HASH_MASK] = TOMBSTONE; + cache->n_tombstones++; + cache->n_glyphs--; + + /* Eliminate tombstones if possible */ + if (cache->glyphs[(idx + 1) & HASH_MASK] == NULL) + { + while (cache->glyphs[idx & HASH_MASK] == TOMBSTONE) + { + cache->glyphs[idx & HASH_MASK] = NULL; + cache->n_tombstones--; + idx--; + } + } +} + +static void +clear_table (pixman_glyph_cache_t *cache) +{ + int i; + + for (i = 0; i < HASH_SIZE; ++i) + { + glyph_t *glyph = cache->glyphs[i]; + + if (glyph && glyph != TOMBSTONE) + free_glyph (glyph); + + cache->glyphs[i] = NULL; + } + + cache->n_glyphs = 0; + cache->n_tombstones = 0; +} + +PIXMAN_EXPORT pixman_glyph_cache_t * +pixman_glyph_cache_create (void) +{ + pixman_glyph_cache_t *cache; + + if (!(cache = malloc (sizeof *cache))) + return NULL; + + memset (cache->glyphs, 0, sizeof (cache->glyphs)); + cache->n_glyphs = 0; + cache->n_tombstones = 0; + cache->freeze_count = 0; + + pixman_list_init (&cache->mru); + + return cache; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_destroy (pixman_glyph_cache_t *cache) +{ + return_if_fail (cache->freeze_count == 0); + + clear_table (cache); + + free (cache); +} + +PIXMAN_EXPORT void +pixman_glyph_cache_freeze (pixman_glyph_cache_t *cache) +{ + cache->freeze_count++; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_thaw (pixman_glyph_cache_t *cache) +{ + if (--cache->freeze_count == 0 && + cache->n_glyphs + cache->n_tombstones > N_GLYPHS_HIGH_WATER) + { + if 
(cache->n_tombstones > N_GLYPHS_HIGH_WATER) + { + /* More than half the entries are + * tombstones. Just dump the whole table. + */ + clear_table (cache); + } + + while (cache->n_glyphs > N_GLYPHS_LOW_WATER) + { + glyph_t *glyph = CONTAINER_OF (glyph_t, mru_link, cache->mru.tail); + + remove_glyph (cache, glyph); + free_glyph (glyph); + } + } +} + +PIXMAN_EXPORT const void * +pixman_glyph_cache_lookup (pixman_glyph_cache_t *cache, + void *font_key, + void *glyph_key) +{ + return lookup_glyph (cache, font_key, glyph_key); +} + +PIXMAN_EXPORT const void * +pixman_glyph_cache_insert (pixman_glyph_cache_t *cache, + void *font_key, + void *glyph_key, + int origin_x, + int origin_y, + pixman_image_t *image) +{ + glyph_t *glyph; + int32_t width, height; + + return_val_if_fail (cache->freeze_count > 0, NULL); + return_val_if_fail (image->type == BITS, NULL); + + width = image->bits.width; + height = image->bits.height; + + if (cache->n_glyphs >= HASH_SIZE) + return NULL; + + if (!(glyph = malloc (sizeof *glyph))) + return NULL; + + glyph->font_key = font_key; + glyph->glyph_key = glyph_key; + glyph->origin_x = origin_x; + glyph->origin_y = origin_y; + + if (!(glyph->image = pixman_image_create_bits ( + image->bits.format, width, height, NULL, -1))) + { + free (glyph); + return NULL; + } + + pixman_image_composite32 (PIXMAN_OP_SRC, + image, NULL, glyph->image, 0, 0, 0, 0, 0, 0, + width, height); + + if (PIXMAN_FORMAT_A (glyph->image->bits.format) != 0 && + PIXMAN_FORMAT_RGB (glyph->image->bits.format) != 0) + { + pixman_image_set_component_alpha (glyph->image, TRUE); + } + + pixman_list_prepend (&cache->mru, &glyph->mru_link); + + _pixman_image_validate (glyph->image); + insert_glyph (cache, glyph); + + return glyph; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_remove (pixman_glyph_cache_t *cache, + void *font_key, + void *glyph_key) +{ + glyph_t *glyph; + + if ((glyph = lookup_glyph (cache, font_key, glyph_key))) + { + remove_glyph (cache, glyph); + + free_glyph (glyph); + } +} + +PIXMAN_EXPORT void +pixman_glyph_get_extents (pixman_glyph_cache_t *cache, + int n_glyphs, + pixman_glyph_t *glyphs, + pixman_box32_t *extents) +{ + int i; + + extents->x1 = extents->y1 = INT32_MAX; + extents->x2 = extents->y2 = INT32_MIN; + + for (i = 0; i < n_glyphs; ++i) + { + glyph_t *glyph = (glyph_t *)glyphs[i].glyph; + int x1, y1, x2, y2; + + x1 = glyphs[i].x - glyph->origin_x; + y1 = glyphs[i].y - glyph->origin_y; + x2 = glyphs[i].x - glyph->origin_x + glyph->image->bits.width; + y2 = glyphs[i].y - glyph->origin_y + glyph->image->bits.height; + + if (x1 < extents->x1) + extents->x1 = x1; + if (y1 < extents->y1) + extents->y1 = y1; + if (x2 > extents->x2) + extents->x2 = x2; + if (y2 > extents->y2) + extents->y2 = y2; + } +} + +/* This function returns a format that is suitable for use as a mask for the + * set of glyphs in question. 
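+ *
+ * For example (editor's note): a mix of a1 and a8 glyphs yields PIXMAN_a8,
+ * while a single ARGB glyph in the set forces PIXMAN_a8r8g8b8.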
+ */ +PIXMAN_EXPORT pixman_format_code_t +pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache, + int n_glyphs, + const pixman_glyph_t *glyphs) +{ + pixman_format_code_t format = PIXMAN_a1; + int i; + + for (i = 0; i < n_glyphs; ++i) + { + const glyph_t *glyph = glyphs[i].glyph; + pixman_format_code_t glyph_format = glyph->image->bits.format; + + if (PIXMAN_FORMAT_TYPE (glyph_format) == PIXMAN_TYPE_A) + { + if (PIXMAN_FORMAT_A (glyph_format) > PIXMAN_FORMAT_A (format)) + format = glyph_format; + } + else + { + return PIXMAN_a8r8g8b8; + } + } + + return format; +} + +static pixman_bool_t +box32_intersect (pixman_box32_t *dest, + const pixman_box32_t *box1, + const pixman_box32_t *box2) +{ + dest->x1 = MAX (box1->x1, box2->x1); + dest->y1 = MAX (box1->y1, box2->y1); + dest->x2 = MIN (box1->x2, box2->x2); + dest->y2 = MIN (box1->y2, box2->y2); + + return dest->x2 > dest->x1 && dest->y2 > dest->y1; +} + +PIXMAN_EXPORT void +pixman_composite_glyphs_no_mask (pixman_op_t op, + pixman_image_t *src, + pixman_image_t *dest, + int32_t src_x, + int32_t src_y, + int32_t dest_x, + int32_t dest_y, + pixman_glyph_cache_t *cache, + int n_glyphs, + const pixman_glyph_t *glyphs) +{ + pixman_region32_t region; + pixman_format_code_t glyph_format = PIXMAN_null; + uint32_t glyph_flags = 0; + pixman_format_code_t dest_format; + uint32_t dest_flags; + pixman_composite_func_t func = NULL; + pixman_implementation_t *implementation = NULL; + pixman_composite_info_t info; + int i; + + _pixman_image_validate (src); + _pixman_image_validate (dest); + + dest_format = dest->common.extended_format_code; + dest_flags = dest->common.flags; + + pixman_region32_init (®ion); + if (!_pixman_compute_composite_region32 ( + ®ion, + src, NULL, dest, + src_x - dest_x, src_y - dest_y, 0, 0, 0, 0, + dest->bits.width, dest->bits.height)) + { + goto out; + } + + info.op = op; + info.src_image = src; + info.dest_image = dest; + info.src_flags = src->common.flags; + info.dest_flags = dest->common.flags; + + for (i = 0; i < n_glyphs; ++i) + { + glyph_t *glyph = (glyph_t *)glyphs[i].glyph; + pixman_image_t *glyph_img = glyph->image; + pixman_box32_t glyph_box; + pixman_box32_t *pbox; + uint32_t extra = FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; + pixman_box32_t composite_box; + int n; + + glyph_box.x1 = dest_x + glyphs[i].x - glyph->origin_x; + glyph_box.y1 = dest_y + glyphs[i].y - glyph->origin_y; + glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; + glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; + + pbox = pixman_region32_rectangles (®ion, &n); + + info.mask_image = glyph_img; + + while (n--) + { + if (box32_intersect (&composite_box, pbox, &glyph_box)) + { + if (glyph_img->common.extended_format_code != glyph_format || + glyph_img->common.flags != glyph_flags) + { + glyph_format = glyph_img->common.extended_format_code; + glyph_flags = glyph_img->common.flags; + + _pixman_implementation_lookup_composite ( + get_implementation(), op, + src->common.extended_format_code, src->common.flags, + glyph_format, glyph_flags | extra, + dest_format, dest_flags, + &implementation, &func); + } + + info.src_x = src_x + composite_box.x1 - dest_x; + info.src_y = src_y + composite_box.y1 - dest_y; + info.mask_x = composite_box.x1 - (dest_x + glyphs[i].x - glyph->origin_x); + info.mask_y = composite_box.y1 - (dest_y + glyphs[i].y - glyph->origin_y); + info.dest_x = composite_box.x1; + info.dest_y = composite_box.y1; + info.width = composite_box.x2 - composite_box.x1; + info.height = composite_box.y2 - composite_box.y1; + + info.mask_flags = 
glyph_flags; + + func (implementation, &info); + } + + pbox++; + } + pixman_list_move_to_front (&cache->mru, &glyph->mru_link); + } + +out: + pixman_region32_fini (®ion); +} + +static void +add_glyphs (pixman_glyph_cache_t *cache, + pixman_image_t *dest, + int off_x, int off_y, + int n_glyphs, const pixman_glyph_t *glyphs) +{ + pixman_format_code_t glyph_format = PIXMAN_null; + uint32_t glyph_flags = 0; + pixman_composite_func_t func = NULL; + pixman_implementation_t *implementation = NULL; + pixman_format_code_t dest_format; + uint32_t dest_flags; + pixman_box32_t dest_box; + pixman_composite_info_t info; + pixman_image_t *white_img = NULL; + pixman_bool_t white_src = FALSE; + int i; + + _pixman_image_validate (dest); + + dest_format = dest->common.extended_format_code; + dest_flags = dest->common.flags; + + info.op = PIXMAN_OP_ADD; + info.dest_image = dest; + info.src_x = 0; + info.src_y = 0; + info.dest_flags = dest_flags; + + dest_box.x1 = 0; + dest_box.y1 = 0; + dest_box.x2 = dest->bits.width; + dest_box.y2 = dest->bits.height; + + for (i = 0; i < n_glyphs; ++i) + { + glyph_t *glyph = (glyph_t *)glyphs[i].glyph; + pixman_image_t *glyph_img = glyph->image; + pixman_box32_t glyph_box; + pixman_box32_t composite_box; + + if (glyph_img->common.extended_format_code != glyph_format || + glyph_img->common.flags != glyph_flags) + { + pixman_format_code_t src_format, mask_format; + + glyph_format = glyph_img->common.extended_format_code; + glyph_flags = glyph_img->common.flags; + + if (glyph_format == dest->bits.format) + { + src_format = glyph_format; + mask_format = PIXMAN_null; + info.src_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; + info.mask_flags = FAST_PATH_IS_OPAQUE; + info.mask_image = NULL; + white_src = FALSE; + } + else + { + if (!white_img) + { + static const pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff }; + + if (!(white_img = pixman_image_create_solid_fill (&white))) + goto out; + + _pixman_image_validate (white_img); + } + + src_format = PIXMAN_solid; + mask_format = glyph_format; + info.src_flags = white_img->common.flags; + info.mask_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; + info.src_image = white_img; + white_src = TRUE; + } + + _pixman_implementation_lookup_composite ( + get_implementation(), PIXMAN_OP_ADD, + src_format, info.src_flags, + mask_format, info.mask_flags, + dest_format, dest_flags, + &implementation, &func); + } + + glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x; + glyph_box.y1 = glyphs[i].y - glyph->origin_y + off_y; + glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; + glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; + + if (box32_intersect (&composite_box, &glyph_box, &dest_box)) + { + int src_x = composite_box.x1 - glyph_box.x1; + int src_y = composite_box.y1 - glyph_box.y1; + + if (white_src) + info.mask_image = glyph_img; + else + info.src_image = glyph_img; + + info.mask_x = info.src_x = src_x; + info.mask_y = info.src_y = src_y; + info.dest_x = composite_box.x1; + info.dest_y = composite_box.y1; + info.width = composite_box.x2 - composite_box.x1; + info.height = composite_box.y2 - composite_box.y1; + + func (implementation, &info); + + pixman_list_move_to_front (&cache->mru, &glyph->mru_link); + } + } + +out: + if (white_img) + pixman_image_unref (white_img); +} + +/* Conceptually, for each glyph, (white IN glyph) is PIXMAN_OP_ADDed to an + * infinitely big mask image at the position such that the glyph origin point + * is positioned at the (glyphs[i].x, glyphs[i].y) point. 
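+ * (Editor's note: PIXMAN_OP_ADD is used when building the mask so that
+ * overlapping glyphs accumulate coverage instead of overwriting each other.)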
+ * + * Then (mask_x, mask_y) in the infinite mask and (src_x, src_y) in the source + * image are both aligned with (dest_x, dest_y) in the destination image. Then + * these three images are composited within the + * + * (dest_x, dest_y, dst_x + width, dst_y + height) + * + * rectangle. + * + * TODO: + * - Trim the mask to the destination clip/image? + * - Trim composite region based on sources, when the op ignores 0s. + */ +PIXMAN_EXPORT void +pixman_composite_glyphs (pixman_op_t op, + pixman_image_t *src, + pixman_image_t *dest, + pixman_format_code_t mask_format, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height, + pixman_glyph_cache_t *cache, + int n_glyphs, + const pixman_glyph_t *glyphs) +{ + pixman_image_t *mask; + + if (!(mask = pixman_image_create_bits (mask_format, width, height, NULL, -1))) + return; + + if (PIXMAN_FORMAT_A (mask_format) != 0 && + PIXMAN_FORMAT_RGB (mask_format) != 0) + { + pixman_image_set_component_alpha (mask, TRUE); + } + + add_glyphs (cache, mask, - mask_x, - mask_y, n_glyphs, glyphs); + + pixman_image_composite32 (op, src, mask, dest, + src_x, src_y, + 0, 0, + dest_x, dest_y, + width, height); + + pixman_image_unref (mask); +} diff --git a/programs/develop/libraries/pixman/pixman-gradient-walker.c b/programs/develop/libraries/pixman/pixman-gradient-walker.c index dd666b4120..5944a559ad 100644 --- a/programs/develop/libraries/pixman/pixman-gradient-walker.c +++ b/programs/develop/libraries/pixman/pixman-gradient-walker.c @@ -31,123 +31,71 @@ void _pixman_gradient_walker_init (pixman_gradient_walker_t *walker, gradient_t * gradient, - unsigned int spread) + pixman_repeat_t repeat) { walker->num_stops = gradient->n_stops; walker->stops = gradient->stops; walker->left_x = 0; walker->right_x = 0x10000; - walker->stepper = 0; - walker->left_ag = 0; - walker->left_rb = 0; - walker->right_ag = 0; - walker->right_rb = 0; - walker->spread = spread; + walker->a_s = 0.0f; + walker->a_b = 0.0f; + walker->r_s = 0.0f; + walker->r_b = 0.0f; + walker->g_s = 0.0f; + walker->g_b = 0.0f; + walker->b_s = 0.0f; + walker->b_b = 0.0f; + walker->repeat = repeat; walker->need_reset = TRUE; } -void -_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, - pixman_fixed_32_32_t pos) +static void +gradient_walker_reset (pixman_gradient_walker_t *walker, + pixman_fixed_48_16_t pos) { int32_t x, left_x, right_x; - pixman_color_t *left_c, *right_c; + pixman_color_t *left_c, *right_c; int n, count = walker->num_stops; - pixman_gradient_stop_t * stops = walker->stops; + pixman_gradient_stop_t *stops = walker->stops; + float la, lr, lg, lb; + float ra, rr, rg, rb; + float lx, rx; - static const pixman_color_t transparent_black = { 0, 0, 0, 0 }; - - switch (walker->spread) + if (walker->repeat == PIXMAN_REPEAT_NORMAL) { - case PIXMAN_REPEAT_NORMAL: - x = (int32_t)pos & 0xFFFF; - for (n = 0; n < count; n++) - if (x < stops[n].x) - break; - if (n == 0) - { - left_x = stops[count - 1].x - 0x10000; - left_c = &stops[count - 1].color; - } - else - { - left_x = stops[n - 1].x; - left_c = &stops[n - 1].color; - } - - if (n == count) - { - right_x = stops[0].x + 0x10000; - right_c = &stops[0].color; - } - else - { - right_x = stops[n].x; - right_c = &stops[n].color; - } - left_x += (pos - x); - right_x += (pos - x); - break; - - case PIXMAN_REPEAT_PAD: - for (n = 0; n < count; n++) - if (pos < stops[n].x) - break; - - if (n == 0) - { - left_x = INT32_MIN; - left_c = &stops[0].color; - } - else - { - 
left_x = stops[n - 1].x; - left_c = &stops[n - 1].color; - } - - if (n == count) - { - right_x = INT32_MAX; - right_c = &stops[n - 1].color; - } - else - { - right_x = stops[n].x; - right_c = &stops[n].color; - } - break; - - case PIXMAN_REPEAT_REFLECT: - x = (int32_t)pos & 0xFFFF; + x = (int32_t)pos & 0xffff; + } + else if (walker->repeat == PIXMAN_REPEAT_REFLECT) + { + x = (int32_t)pos & 0xffff; if ((int32_t)pos & 0x10000) x = 0x10000 - x; - for (n = 0; n < count; n++) - if (x < stops[n].x) - break; - - if (n == 0) - { - left_x = -stops[0].x; - left_c = &stops[0].color; - } - else - { - left_x = stops[n - 1].x; - left_c = &stops[n - 1].color; - } - - if (n == count) - { - right_x = 0x20000 - stops[n - 1].x; - right_c = &stops[n - 1].color; - } - else - { - right_x = stops[n].x; - right_c = &stops[n].color; - } + } + else + { + x = pos; + } + + for (n = 0; n < count; n++) + { + if (x < stops[n].x) + break; + } + + left_x = stops[n - 1].x; + left_c = &stops[n - 1].color; + + right_x = stops[n].x; + right_c = &stops[n].color; + if (walker->repeat == PIXMAN_REPEAT_NORMAL) + { + left_x += (pos - x); + right_x += (pos - x); + } + else if (walker->repeat == PIXMAN_REPEAT_REFLECT) + { if ((int32_t)pos & 0x10000) { pixman_color_t *tmp_c; @@ -165,90 +113,90 @@ _pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, } left_x += (pos - x); right_x += (pos - x); - break; - - default: /* REPEAT_NONE */ - for (n = 0; n < count; n++) - if (pos < stops[n].x) - break; - + } + else if (walker->repeat == PIXMAN_REPEAT_NONE) + { if (n == 0) - { - left_x = INT32_MIN; - right_x = stops[0].x; - left_c = right_c = (pixman_color_t*) &transparent_black; - } + right_c = left_c; else if (n == count) - { - left_x = stops[n - 1].x; - right_x = INT32_MAX; - left_c = right_c = (pixman_color_t*) &transparent_black; - } - else - { - left_x = stops[n - 1].x; - right_x = stops[n].x; - left_c = &stops[n - 1].color; - right_c = &stops[n].color; - } + left_c = right_c; } - walker->left_x = left_x; - walker->right_x = right_x; - walker->left_ag = ((left_c->alpha >> 8) << 16) | (left_c->green >> 8); - walker->left_rb = ((left_c->red & 0xff00) << 8) | (left_c->blue >> 8); - walker->right_ag = ((right_c->alpha >> 8) << 16) | (right_c->green >> 8); - walker->right_rb = ((right_c->red & 0xff00) << 8) | (right_c->blue >> 8); + /* The alpha channel is scaled to be in the [0, 255] interval, + * and the red/green/blue channels are scaled to be in [0, 1]. + * This ensures that after premultiplication all channels will + * be in the [0, 255] interval. 
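+ *
+ * (Editor's worked example: a stop with red = alpha = 0xffff gives
+ * la = lr = 65535 / 257 = 255.0; the extra 1.0f/255.0f factor folded into
+ * the r_s/r_b coefficients below brings red back to 1.0, so the
+ * premultiplied value computed in _pixman_gradient_walker_pixel() is
+ * a * r = 255, i.e. a 0xffff0000 pixel.)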
+ */ + la = (left_c->alpha * (1.0f/257.0f)); + lr = (left_c->red * (1.0f/257.0f)); + lg = (left_c->green * (1.0f/257.0f)); + lb = (left_c->blue * (1.0f/257.0f)); - if (walker->left_x == walker->right_x || - ( walker->left_ag == walker->right_ag && - walker->left_rb == walker->right_rb ) ) + ra = (right_c->alpha * (1.0f/257.0f)); + rr = (right_c->red * (1.0f/257.0f)); + rg = (right_c->green * (1.0f/257.0f)); + rb = (right_c->blue * (1.0f/257.0f)); + + lx = left_x * (1.0f/65536.0f); + rx = right_x * (1.0f/65536.0f); + + if (FLOAT_IS_ZERO (rx - lx) || left_x == INT32_MIN || right_x == INT32_MAX) { - walker->stepper = 0; + walker->a_s = walker->r_s = walker->g_s = walker->b_s = 0.0f; + walker->a_b = (la + ra) / 2.0f; + walker->r_b = (lr + rr) / 510.0f; + walker->g_b = (lg + rg) / 510.0f; + walker->b_b = (lb + rb) / 510.0f; } else { - int32_t width = right_x - left_x; - walker->stepper = ((1 << 24) + width / 2) / width; + float w_rec = 1.0f / (rx - lx); + + walker->a_b = (la * rx - ra * lx) * w_rec; + walker->r_b = (lr * rx - rr * lx) * w_rec * (1.0f/255.0f); + walker->g_b = (lg * rx - rg * lx) * w_rec * (1.0f/255.0f); + walker->b_b = (lb * rx - rb * lx) * w_rec * (1.0f/255.0f); + + walker->a_s = (ra - la) * w_rec; + walker->r_s = (rr - lr) * w_rec * (1.0f/255.0f); + walker->g_s = (rg - lg) * w_rec * (1.0f/255.0f); + walker->b_s = (rb - lb) * w_rec * (1.0f/255.0f); } + + walker->left_x = left_x; + walker->right_x = right_x; walker->need_reset = FALSE; } -#define PIXMAN_GRADIENT_WALKER_NEED_RESET(w, x) \ - ( (w)->need_reset || (x) < (w)->left_x || (x) >= (w)->right_x) - - -/* the following assumes that PIXMAN_GRADIENT_WALKER_NEED_RESET(w,x) is FALSE */ uint32_t _pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker, - pixman_fixed_32_32_t x) + pixman_fixed_48_16_t x) { - int dist, idist; - uint32_t t1, t2, a, color; + float a, r, g, b; + uint8_t a8, r8, g8, b8; + uint32_t v; + float y; - if (PIXMAN_GRADIENT_WALKER_NEED_RESET (walker, x)) - _pixman_gradient_walker_reset (walker, x); + if (walker->need_reset || x < walker->left_x || x >= walker->right_x) + gradient_walker_reset (walker, x); - dist = ((int)(x - walker->left_x) * walker->stepper) >> 16; - idist = 256 - dist; + y = x * (1.0f / 65536.0f); - /* combined INTERPOLATE and premultiply */ - t1 = walker->left_rb * idist + walker->right_rb * dist; - t1 = (t1 >> 8) & 0xff00ff; + a = walker->a_s * y + walker->a_b; + r = a * (walker->r_s * y + walker->r_b); + g = a * (walker->g_s * y + walker->g_b); + b = a * (walker->b_s * y + walker->b_b); - t2 = walker->left_ag * idist + walker->right_ag * dist; - t2 &= 0xff00ff00; + a8 = a + 0.5f; + r8 = r + 0.5f; + g8 = g + 0.5f; + b8 = b + 0.5f; - color = t2 & 0xff000000; - a = t2 >> 24; + v = ((a8 << 24) & 0xff000000) | + ((r8 << 16) & 0x00ff0000) | + ((g8 << 8) & 0x0000ff00) | + ((b8 >> 0) & 0x000000ff); - t1 = t1 * a + 0x800080; - t1 = (t1 + ((t1 >> 8) & 0xff00ff)) >> 8; - - t2 = (t2 >> 8) * a + 0x800080; - t2 = (t2 + ((t2 >> 8) & 0xff00ff)); - - return (color | (t1 & 0xff00ff) | (t2 & 0xff00)); + return v; } - diff --git a/programs/develop/libraries/pixman/pixman-image.c b/programs/develop/libraries/pixman/pixman-image.c index c646471930..65041b43b7 100644 --- a/programs/develop/libraries/pixman/pixman-image.c +++ b/programs/develop/libraries/pixman/pixman-image.c @@ -30,7 +30,50 @@ #include #include "pixman-private.h" -#include "pixman-combine32.h" + +static const pixman_color_t transparent_black = { 0, 0, 0, 0 }; + +static void +gradient_property_changed (pixman_image_t *image) +{ + 
gradient_t *gradient = &image->gradient; + int n = gradient->n_stops; + pixman_gradient_stop_t *stops = gradient->stops; + pixman_gradient_stop_t *begin = &(gradient->stops[-1]); + pixman_gradient_stop_t *end = &(gradient->stops[n]); + + switch (gradient->common.repeat) + { + default: + case PIXMAN_REPEAT_NONE: + begin->x = INT32_MIN; + begin->color = transparent_black; + end->x = INT32_MAX; + end->color = transparent_black; + break; + + case PIXMAN_REPEAT_NORMAL: + begin->x = stops[n - 1].x - pixman_fixed_1; + begin->color = stops[n - 1].color; + end->x = stops[0].x + pixman_fixed_1; + end->color = stops[0].color; + break; + + case PIXMAN_REPEAT_REFLECT: + begin->x = - stops[0].x; + begin->color = stops[0].color; + end->x = pixman_int_to_fixed (2) - stops[n - 1].x; + end->color = stops[n - 1].color; + break; + + case PIXMAN_REPEAT_PAD: + begin->x = INT32_MIN; + begin->color = stops[0].color; + end->x = INT32_MAX; + end->color = stops[n - 1].color; + break; + } +} pixman_bool_t _pixman_init_gradient (gradient_t * gradient, @@ -39,54 +82,100 @@ _pixman_init_gradient (gradient_t * gradient, { return_val_if_fail (n_stops > 0, FALSE); - gradient->stops = pixman_malloc_ab (n_stops, sizeof (pixman_gradient_stop_t)); + /* We allocate two extra stops, one before the beginning of the stop list, + * and one after the end. These stops are initialized to whatever color + * would be used for positions outside the range of the stop list. + * + * This saves a bit of computation in the gradient walker. + * + * The pointer we store in the gradient_t struct still points to the + * first user-supplied struct, so when freeing, we will have to + * subtract one. + */ + gradient->stops = + pixman_malloc_ab (n_stops + 2, sizeof (pixman_gradient_stop_t)); if (!gradient->stops) return FALSE; + gradient->stops += 1; memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t)); - gradient->n_stops = n_stops; - gradient->stop_range = 0xffff; + gradient->common.property_changed = gradient_property_changed; return TRUE; } -/* - * By default, just evaluate the image at 32bpp and expand. Individual image - * types can plug in a better scanline getter if they want to. For example - * we could produce smoother gradients by evaluating them at higher color - * depth, but that's a project for the future. - */ void -_pixman_image_get_scanline_generic_64 (pixman_image_t * image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t * mask) +_pixman_image_init (pixman_image_t *image) { - uint32_t *mask8 = NULL; + image_common_t *common = &image->common; - /* Contract the mask image, if one exists, so that the 32-bit fetch - * function can use it. 
- */ - if (mask) + pixman_region32_init (&common->clip_region); + + common->alpha_count = 0; + common->have_clip_region = FALSE; + common->clip_sources = FALSE; + common->transform = NULL; + common->repeat = PIXMAN_REPEAT_NONE; + common->filter = PIXMAN_FILTER_NEAREST; + common->filter_params = NULL; + common->n_filter_params = 0; + common->alpha_map = NULL; + common->component_alpha = FALSE; + common->ref_count = 1; + common->property_changed = NULL; + common->client_clip = FALSE; + common->destroy_func = NULL; + common->destroy_data = NULL; + common->dirty = TRUE; +} + +pixman_bool_t +_pixman_image_fini (pixman_image_t *image) +{ + image_common_t *common = (image_common_t *)image; + + common->ref_count--; + + if (common->ref_count == 0) { - mask8 = pixman_malloc_ab (width, sizeof(uint32_t)); - if (!mask8) - return; + if (image->common.destroy_func) + image->common.destroy_func (image, image->common.destroy_data); - pixman_contract (mask8, (uint64_t *)mask, width); + pixman_region32_fini (&common->clip_region); + + free (common->transform); + free (common->filter_params); + + if (common->alpha_map) + pixman_image_unref ((pixman_image_t *)common->alpha_map); + + if (image->type == LINEAR || + image->type == RADIAL || + image->type == CONICAL) + { + if (image->gradient.stops) + { + /* See _pixman_init_gradient() for an explanation of the - 1 */ + free (image->gradient.stops - 1); + } + + /* This will trigger if someone adds a property_changed + * method to the linear/radial/conical gradient overwriting + * the general one. + */ + assert ( + image->common.property_changed == gradient_property_changed); + } + + if (image->type == BITS && image->bits.free_me) + free (image->bits.free_me); + + return TRUE; } - /* Fetch the source image into the first half of buffer. */ - _pixman_image_get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8); - - /* Expand from 32bpp to 64bpp in place. */ - pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width); - - free (mask8); + return FALSE; } pixman_image_t * @@ -95,70 +184,11 @@ _pixman_image_allocate (void) pixman_image_t *image = malloc (sizeof (pixman_image_t)); if (image) - { - image_common_t *common = &image->common; - - pixman_region32_init (&common->clip_region); - - common->alpha_count = 0; - common->have_clip_region = FALSE; - common->clip_sources = FALSE; - common->transform = NULL; - common->repeat = PIXMAN_REPEAT_NONE; - common->filter = PIXMAN_FILTER_NEAREST; - common->filter_params = NULL; - common->n_filter_params = 0; - common->alpha_map = NULL; - common->component_alpha = FALSE; - common->ref_count = 1; - common->classify = NULL; - common->client_clip = FALSE; - common->destroy_func = NULL; - common->destroy_data = NULL; - common->dirty = TRUE; - } + _pixman_image_init (image); return image; } -source_image_class_t -_pixman_image_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) -{ - if (image->common.classify) - return image->common.classify (image, x, y, width, height); - else - return SOURCE_IMAGE_CLASS_UNKNOWN; -} - -void -_pixman_image_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - image->common.get_scanline_32 (image, x, y, width, buffer, mask); -} - -/* Even thought the type of buffer is uint32_t *, the function actually expects - * a uint64_t *buffer. 
- */ -void -_pixman_image_get_scanline_64 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *unused) -{ - image->common.get_scanline_64 (image, x, y, width, buffer, unused); -} - static void image_property_changed (pixman_image_t *image) { @@ -178,39 +208,9 @@ pixman_image_ref (pixman_image_t *image) PIXMAN_EXPORT pixman_bool_t pixman_image_unref (pixman_image_t *image) { - image_common_t *common = (image_common_t *)image; - - common->ref_count--; - - if (common->ref_count == 0) + if (_pixman_image_fini (image)) { - if (image->common.destroy_func) - image->common.destroy_func (image, image->common.destroy_data); - - pixman_region32_fini (&common->clip_region); - - if (common->transform) - free (common->transform); - - if (common->filter_params) - free (common->filter_params); - - if (common->alpha_map) - pixman_image_unref ((pixman_image_t *)common->alpha_map); - - if (image->type == LINEAR || - image->type == RADIAL || - image->type == CONICAL) - { - if (image->gradient.stops) - free (image->gradient.stops); - } - - if (image->type == BITS && image->bits.free_me) - free (image->bits.free_me); - free (image); - return TRUE; } @@ -238,54 +238,27 @@ _pixman_image_reset_clip_region (pixman_image_t *image) image->common.have_clip_region = FALSE; } -static pixman_bool_t out_of_bounds_workaround = TRUE; - -/* Old X servers rely on out-of-bounds accesses when they are asked - * to composite with a window as the source. They create a pixman image - * pointing to some bogus position in memory, but then they set a clip - * region to the position where the actual bits are. +/* Executive Summary: This function is a no-op that only exists + * for historical reasons. + * + * There used to be a bug in the X server where it would rely on + * out-of-bounds accesses when it was asked to composite with a + * window as the source. It would create a pixman image pointing + * to some bogus position in memory, but then set a clip region + * to the position where the actual bits were. * * Due to a bug in old versions of pixman, where it would not clip * against the image bounds when a clip region was set, this would - * actually work. So by default we allow certain out-of-bound access - * to happen unless explicitly disabled. + * actually work. So when the pixman bug was fixed, a workaround was + * added to allow certain out-of-bound accesses. This function disabled + * those workarounds. * - * Fixed X servers should call this function to disable the workaround. + * Since 0.21.2, pixman doesn't do these workarounds anymore, so now + * this function is a no-op. */ PIXMAN_EXPORT void pixman_disable_out_of_bounds_workaround (void) { - out_of_bounds_workaround = FALSE; -} - -static pixman_bool_t -source_image_needs_out_of_bounds_workaround (bits_image_t *image) -{ - if (image->common.clip_sources && - image->common.repeat == PIXMAN_REPEAT_NONE && - image->common.have_clip_region && - out_of_bounds_workaround) - { - if (!image->common.client_clip) - { - /* There is no client clip, so if the clip region extends beyond the - * drawable geometry, it must be because the X server generated the - * bogus clip region. 
- */ - const pixman_box32_t *extents = - pixman_region32_extents (&image->common.clip_region); - - if (extents->x1 >= 0 && extents->x2 <= image->width && - extents->y1 >= 0 && extents->y2 <= image->height) - { - return FALSE; - } - } - - return TRUE; - } - - return FALSE; } static void @@ -315,8 +288,24 @@ compute_image_info (pixman_image_t *image) if (image->common.transform->matrix[0][1] == 0 && image->common.transform->matrix[1][0] == 0) { + if (image->common.transform->matrix[0][0] == -pixman_fixed_1 && + image->common.transform->matrix[1][1] == -pixman_fixed_1) + { + flags |= FAST_PATH_ROTATE_180_TRANSFORM; + } flags |= FAST_PATH_SCALE_TRANSFORM; } + else if (image->common.transform->matrix[0][0] == 0 && + image->common.transform->matrix[1][1] == 0) + { + pixman_fixed_t m01 = image->common.transform->matrix[0][1]; + pixman_fixed_t m10 = image->common.transform->matrix[1][0]; + + if (m01 == -pixman_fixed_1 && m10 == pixman_fixed_1) + flags |= FAST_PATH_ROTATE_90_TRANSFORM; + else if (m01 == pixman_fixed_1 && m10 == -pixman_fixed_1) + flags |= FAST_PATH_ROTATE_270_TRANSFORM; + } } if (image->common.transform->matrix[0][0] > 0) @@ -338,11 +327,56 @@ compute_image_info (pixman_image_t *image) case PIXMAN_FILTER_GOOD: case PIXMAN_FILTER_BEST: flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER); + + /* Here we have a chance to optimize BILINEAR filter to NEAREST if + * they are equivalent for the currently used transformation matrix. + */ + if (flags & FAST_PATH_ID_TRANSFORM) + { + flags |= FAST_PATH_NEAREST_FILTER; + } + else if ( + /* affine and integer translation components in matrix ... */ + ((flags & FAST_PATH_AFFINE_TRANSFORM) && + !pixman_fixed_frac (image->common.transform->matrix[0][2] | + image->common.transform->matrix[1][2])) && + ( + /* ... combined with a simple rotation */ + (flags & (FAST_PATH_ROTATE_90_TRANSFORM | + FAST_PATH_ROTATE_180_TRANSFORM | + FAST_PATH_ROTATE_270_TRANSFORM)) || + /* ... or combined with a simple non-rotated translation */ + (image->common.transform->matrix[0][0] == pixman_fixed_1 && + image->common.transform->matrix[1][1] == pixman_fixed_1 && + image->common.transform->matrix[0][1] == 0 && + image->common.transform->matrix[1][0] == 0) + ) + ) + { + /* FIXME: there are some affine-test failures, showing that + * handling of BILINEAR and NEAREST filter is not quite + * equivalent when getting close to 32K for the translation + * components of the matrix. That's likely some bug, but for + * now just skip BILINEAR->NEAREST optimization in this case. 
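+ *
+ * (Editor's note: pixman_int_to_fixed (30000) is 30000 << 16 = 1966080000,
+ * already close to INT32_MAX, and 32768 cannot be represented in 16.16
+ * fixed point at all, which is why trouble shows up near the 32K mark.)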
+ */ + pixman_fixed_t magic_limit = pixman_int_to_fixed (30000); + if (image->common.transform->matrix[0][2] <= magic_limit && + image->common.transform->matrix[1][2] <= magic_limit && + image->common.transform->matrix[0][2] >= -magic_limit && + image->common.transform->matrix[1][2] >= -magic_limit) + { + flags |= FAST_PATH_NEAREST_FILTER; + } + } break; case PIXMAN_FILTER_CONVOLUTION: break; + case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: + flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER; + break; + default: flags |= FAST_PATH_NO_CONVOLUTION_FILTER; break; @@ -408,6 +442,7 @@ compute_image_info (pixman_image_t *image) else { code = image->bits.format; + flags |= FAST_PATH_BITS_IMAGE; } if (!PIXMAN_FORMAT_A (image->bits.format) && @@ -420,9 +455,6 @@ compute_image_info (pixman_image_t *image) flags |= FAST_PATH_IS_OPAQUE; } - if (source_image_needs_out_of_bounds_workaround (&image->bits)) - flags |= FAST_PATH_NEEDS_WORKAROUND; - if (image->bits.read_func || image->bits.write_func) flags &= ~FAST_PATH_NO_ACCESSORS; @@ -445,6 +477,7 @@ compute_image_info (pixman_image_t *image) /* Fall through */ + case CONICAL: case LINEAR: code = PIXMAN_unknown; @@ -486,8 +519,9 @@ compute_image_info (pixman_image_t *image) * if all channels are opaque, so we simply turn it off * unconditionally for those images. */ - if (image->common.alpha_map || - image->common.filter == PIXMAN_FILTER_CONVOLUTION || + if (image->common.alpha_map || + image->common.filter == PIXMAN_FILTER_CONVOLUTION || + image->common.filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION || image->common.component_alpha) { flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE); @@ -509,7 +543,8 @@ _pixman_image_validate (pixman_image_t *image) * property_changed() can make use of the flags * to set up accessors etc. 
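 * (Editor's note: common.dirty is raised again by image_property_changed()
 * whenever a property is modified, so for images that have not changed
 * since their last use this whole validation step is skipped.)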
*/ - image->common.property_changed (image); + if (image->common.property_changed) + image->common.property_changed (image); image->common.dirty = FALSE; } @@ -590,7 +625,7 @@ pixman_image_set_transform (pixman_image_t * image, if (common->transform == transform) return TRUE; - if (memcmp (&id, transform, sizeof (pixman_transform_t)) == 0) + if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0) { free (common->transform); common->transform = NULL; @@ -599,6 +634,12 @@ pixman_image_set_transform (pixman_image_t * image, goto out; } + if (common->transform && + memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0) + { + return TRUE; + } + if (common->transform == NULL) common->transform = malloc (sizeof (pixman_transform_t)); @@ -623,6 +664,9 @@ PIXMAN_EXPORT void pixman_image_set_repeat (pixman_image_t *image, pixman_repeat_t repeat) { + if (image->common.repeat == repeat) + return; + image->common.repeat = repeat; image_property_changed (image); @@ -640,6 +684,19 @@ pixman_image_set_filter (pixman_image_t * image, if (params == common->filter_params && filter == common->filter) return TRUE; + if (filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION) + { + int width = pixman_fixed_to_int (params[0]); + int height = pixman_fixed_to_int (params[1]); + int x_phase_bits = pixman_fixed_to_int (params[2]); + int y_phase_bits = pixman_fixed_to_int (params[3]); + int n_x_phases = (1 << x_phase_bits); + int n_y_phases = (1 << y_phase_bits); + + return_val_if_fail ( + n_params == 4 + n_x_phases * width + n_y_phases * height, FALSE); + } + new_params = NULL; if (params) { @@ -667,6 +724,9 @@ PIXMAN_EXPORT void pixman_image_set_source_clipping (pixman_image_t *image, pixman_bool_t clip_sources) { + if (image->common.clip_sources == clip_sources) + return; + image->common.clip_sources = clip_sources; image_property_changed (image); @@ -682,6 +742,9 @@ pixman_image_set_indexed (pixman_image_t * image, { bits_image_t *bits = (bits_image_t *)image; + if (bits->indexed == indexed) + return; + bits->indexed = indexed; image_property_changed (image); @@ -744,6 +807,9 @@ PIXMAN_EXPORT void pixman_image_set_component_alpha (pixman_image_t *image, pixman_bool_t component_alpha) { + if (image->common.component_alpha == component_alpha) + return; + image->common.component_alpha = component_alpha; image_property_changed (image); @@ -822,19 +888,47 @@ pixman_image_get_format (pixman_image_t *image) if (image->type == BITS) return image->bits.format; - return 0; + return PIXMAN_null; } uint32_t -_pixman_image_get_solid (pixman_image_t * image, - pixman_format_code_t format) +_pixman_image_get_solid (pixman_implementation_t *imp, + pixman_image_t * image, + pixman_format_code_t format) { uint32_t result; - _pixman_image_get_scanline_32 (image, 0, 0, 1, &result, NULL); + if (image->type == SOLID) + { + result = image->solid.color_32; + } + else if (image->type == BITS) + { + if (image->bits.format == PIXMAN_a8r8g8b8) + result = image->bits.bits[0]; + else if (image->bits.format == PIXMAN_x8r8g8b8) + result = image->bits.bits[0] | 0xff000000; + else if (image->bits.format == PIXMAN_a8) + result = (*(uint8_t *)image->bits.bits) << 24; + else + goto otherwise; + } + else + { + pixman_iter_t iter; + + otherwise: + _pixman_implementation_src_iter_init ( + imp, &iter, image, 0, 0, 1, 1, + (uint8_t *)&result, + ITER_NARROW, image->common.flags); + + result = *iter.get_scanline (&iter, NULL); + } /* If necessary, convert RGB <--> BGR. 
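 * PIXMAN_TYPE_ARGB_SRGB is exempted below as well (editor's note), since
 * those formats store their channels in the same ARGB order.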
*/ - if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB) + if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB + && PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB_SRGB) { result = (((result & 0xff000000) >> 0) | ((result & 0x00ff0000) >> 16) | diff --git a/programs/develop/libraries/pixman/pixman-implementation.c b/programs/develop/libraries/pixman/pixman-implementation.c index bc3749ef59..0c97cd3a27 100644 --- a/programs/develop/libraries/pixman/pixman-implementation.c +++ b/programs/develop/libraries/pixman/pixman-implementation.c @@ -27,168 +27,206 @@ #include #include "pixman-private.h" -static void -delegate_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - _pixman_implementation_combine_32 (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - _pixman_implementation_combine_64 (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - _pixman_implementation_combine_32_ca (imp->delegate, - op, dest, src, mask, width); -} - -static void -delegate_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) -{ - _pixman_implementation_combine_64_ca (imp->delegate, - op, dest, src, mask, width); -} - -static pixman_bool_t -delegate_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - return _pixman_implementation_blt ( - imp->delegate, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); -} - -static pixman_bool_t -delegate_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); -} - pixman_implementation_t * -_pixman_implementation_create (pixman_implementation_t *delegate, +_pixman_implementation_create (pixman_implementation_t *fallback, const pixman_fast_path_t *fast_paths) { - pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t)); - pixman_implementation_t *d; - int i; - - if (!imp) - return NULL; + pixman_implementation_t *imp; assert (fast_paths); - /* Make sure the whole delegate chain has the right toplevel */ - imp->delegate = delegate; - for (d = imp; d != NULL; d = d->delegate) - d->toplevel = imp; - - /* Fill out function pointers with ones that just delegate - */ - imp->blt = delegate_blt; - imp->fill = delegate_fill; - - for (i = 0; i < PIXMAN_N_OPERATORS; ++i) + if ((imp = malloc (sizeof (pixman_implementation_t)))) { - imp->combine_32[i] = delegate_combine_32; - imp->combine_64[i] = delegate_combine_64; - imp->combine_32_ca[i] = delegate_combine_32_ca; - imp->combine_64_ca[i] = delegate_combine_64_ca; + pixman_implementation_t *d; + + memset (imp, 0, sizeof *imp); + + imp->fallback = fallback; + imp->fast_paths = fast_paths; + + /* Make sure the whole fallback chain has the right toplevel */ + for (d = imp; d != NULL; d = 
d->fallback) + d->toplevel = imp; } - imp->fast_paths = fast_paths; - return imp; } -void -_pixman_implementation_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) +#define N_CACHED_FAST_PATHS 8 + +typedef struct +{ + struct + { + pixman_implementation_t * imp; + pixman_fast_path_t fast_path; + } cache [N_CACHED_FAST_PATHS]; +} cache_t; + +PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache); + +static void +dummy_composite_rect (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - (*imp->combine_32[op]) (imp, op, dest, src, mask, width); } void -_pixman_implementation_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +_pixman_implementation_lookup_composite (pixman_implementation_t *toplevel, + pixman_op_t op, + pixman_format_code_t src_format, + uint32_t src_flags, + pixman_format_code_t mask_format, + uint32_t mask_flags, + pixman_format_code_t dest_format, + uint32_t dest_flags, + pixman_implementation_t **out_imp, + pixman_composite_func_t *out_func) { - (*imp->combine_64[op]) (imp, op, dest, src, mask, width); + pixman_implementation_t *imp; + cache_t *cache; + int i; + + /* Check cache for fast paths */ + cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache); + + for (i = 0; i < N_CACHED_FAST_PATHS; ++i) + { + const pixman_fast_path_t *info = &(cache->cache[i].fast_path); + + /* Note that we check for equality here, not whether + * the cached fast path matches. This is to prevent + * us from selecting an overly general fast path + * when a more specific one would work. + */ + if (info->op == op && + info->src_format == src_format && + info->mask_format == mask_format && + info->dest_format == dest_format && + info->src_flags == src_flags && + info->mask_flags == mask_flags && + info->dest_flags == dest_flags && + info->func) + { + *out_imp = cache->cache[i].imp; + *out_func = cache->cache[i].fast_path.func; + + goto update_cache; + } + } + + for (imp = toplevel; imp != NULL; imp = imp->fallback) + { + const pixman_fast_path_t *info = imp->fast_paths; + + while (info->op != PIXMAN_OP_NONE) + { + if ((info->op == op || info->op == PIXMAN_OP_any) && + /* Formats */ + ((info->src_format == src_format) || + (info->src_format == PIXMAN_any)) && + ((info->mask_format == mask_format) || + (info->mask_format == PIXMAN_any)) && + ((info->dest_format == dest_format) || + (info->dest_format == PIXMAN_any)) && + /* Flags */ + (info->src_flags & src_flags) == info->src_flags && + (info->mask_flags & mask_flags) == info->mask_flags && + (info->dest_flags & dest_flags) == info->dest_flags) + { + *out_imp = imp; + *out_func = info->func; + + /* Set i to the last spot in the cache so that the + * move-to-front code below will work + */ + i = N_CACHED_FAST_PATHS - 1; + + goto update_cache; + } + + ++info; + } + } + + /* We should never reach this point */ + _pixman_log_error ( + FUNC, + "No composite function found\n" + "\n" + "The most likely cause of this is that this system has issues with\n" + "thread local storage\n"); + + *out_imp = NULL; + *out_func = dummy_composite_rect; + return; + +update_cache: + if (i) + { + while (i--) + cache->cache[i + 1] = cache->cache[i]; + + cache->cache[0].imp = *out_imp; + cache->cache[0].fast_path.op = op; + cache->cache[0].fast_path.src_format = src_format; + cache->cache[0].fast_path.src_flags = src_flags; + cache->cache[0].fast_path.mask_format = mask_format; + 
cache->cache[0].fast_path.mask_flags = mask_flags; + cache->cache[0].fast_path.dest_format = dest_format; + cache->cache[0].fast_path.dest_flags = dest_flags; + cache->cache[0].fast_path.func = *out_func; + } } -void -_pixman_implementation_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) +static void +dummy_combine (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { - (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width); } -void -_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +pixman_combine_32_func_t +_pixman_implementation_lookup_combiner (pixman_implementation_t *imp, + pixman_op_t op, + pixman_bool_t component_alpha, + pixman_bool_t narrow) { - (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width); + while (imp) + { + pixman_combine_32_func_t f = NULL; + + switch ((narrow << 1) | component_alpha) + { + case 0: /* not narrow, not component alpha */ + f = (pixman_combine_32_func_t)imp->combine_float[op]; + break; + + case 1: /* not narrow, component_alpha */ + f = (pixman_combine_32_func_t)imp->combine_float_ca[op]; + break; + + case 2: /* narrow, not component alpha */ + f = imp->combine_32[op]; + break; + + case 3: /* narrow, component_alpha */ + f = imp->combine_32_ca[op]; + break; + } + + if (f) + return f; + + imp = imp->fallback; + } + + /* We should never reach this point */ + _pixman_log_error (FUNC, "No known combine function\n"); + return dummy_combine; } pixman_bool_t @@ -201,14 +239,25 @@ _pixman_implementation_blt (pixman_implementation_t * imp, int dst_bpp, int src_x, int src_y, - int dst_x, - int dst_y, + int dest_x, + int dest_y, int width, int height) { - return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); + while (imp) + { + if (imp->blt && + (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, + width, height)) + { + return TRUE; + } + + imp = imp->fallback; + } + + return FALSE; } pixman_bool_t @@ -220,8 +269,130 @@ _pixman_implementation_fill (pixman_implementation_t *imp, int y, int width, int height, - uint32_t xor) + uint32_t filler) { - return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor); + while (imp) + { + if (imp->fill && + ((*imp->fill) (imp, bits, stride, bpp, x, y, width, height, filler))) + { + return TRUE; + } + + imp = imp->fallback; + } + + return FALSE; } +pixman_bool_t +_pixman_implementation_src_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t iter_flags, + uint32_t image_flags) +{ + iter->image = image; + iter->buffer = (uint32_t *)buffer; + iter->x = x; + iter->y = y; + iter->width = width; + iter->height = height; + iter->iter_flags = iter_flags; + iter->image_flags = image_flags; + + while (imp) + { + if (imp->src_iter_init && (*imp->src_iter_init) (imp, iter)) + return TRUE; + + imp = imp->fallback; + } + + return FALSE; +} + +pixman_bool_t +_pixman_implementation_dest_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t iter_flags, + uint32_t image_flags) +{ + 
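The blt, fill and iterator-init wrappers above all use the same delegation pattern: try the current implementation and, if it declines, move on to its fallback. A standalone sketch of that pattern with simplified types and illustrative names (the dest_iter_init body continues below):

typedef struct impl impl_t;

struct impl
{
    impl_t *fallback;                          /* next, more general impl */
    int   (*try_op) (impl_t *imp, void *args); /* nonzero on success */
};

/* Walk the chain until some implementation accepts the operation; the
 * last implementation in the chain is expected to handle everything. */
static int
run_with_fallback (impl_t *imp, void *args)
{
    while (imp)
    {
        if (imp->try_op && imp->try_op (imp, args))
            return 1;

        imp = imp->fallback;
    }

    return 0;
}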
iter->image = image; + iter->buffer = (uint32_t *)buffer; + iter->x = x; + iter->y = y; + iter->width = width; + iter->height = height; + iter->iter_flags = iter_flags; + iter->image_flags = image_flags; + + while (imp) + { + if (imp->dest_iter_init && (*imp->dest_iter_init) (imp, iter)) + return TRUE; + + imp = imp->fallback; + } + + return FALSE; +} + +pixman_bool_t +_pixman_disabled (const char *name) +{ + const char *env; + + if ((env = getenv ("PIXMAN_DISABLE"))) + { + do + { + const char *end; + int len; + + if ((end = strchr (env, ' '))) + len = end - env; + else + len = strlen (env); + + if (strlen (name) == len && strncmp (name, env, len) == 0) + { + printf ("pixman: Disabled %s implementation\n", name); + return TRUE; + } + + env += len; + } + while (*env++); + } + + return FALSE; +} + +pixman_implementation_t * +_pixman_choose_implementation (void) +{ + pixman_implementation_t *imp; + + imp = _pixman_implementation_create_general(); + + if (!_pixman_disabled ("fast")) + imp = _pixman_implementation_create_fast_path (imp); + + imp = _pixman_x86_get_implementations (imp); + + imp = _pixman_implementation_create_noop (imp); + + return imp; +} diff --git a/programs/develop/libraries/pixman/pixman-inlines.h b/programs/develop/libraries/pixman/pixman-inlines.h new file mode 100644 index 0000000000..dd1c2f17f0 --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-inlines.h @@ -0,0 +1,1339 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. + */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +/* Flags describing input parameters to fast path macro template. + * Turning on some flag values may indicate that + * "some property X is available so template can use this" or + * "some property X should be handled by template". + * + * FLAG_HAVE_SOLID_MASK + * Input mask is solid so template should handle this. + * + * FLAG_HAVE_NON_SOLID_MASK + * Input mask is bits mask so template should handle this. + * + * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually + * exclusive. 
(It's not allowed to turn both flags on) + */ +#define FLAG_NONE (0) +#define FLAG_HAVE_SOLID_MASK (1 << 1) +#define FLAG_HAVE_NON_SOLID_MASK (1 << 2) + +/* To avoid too short repeated scanline function calls, extend source + * scanlines having width less than below constant value. + */ +#define REPEAT_NORMAL_MIN_WIDTH 64 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ + if (repeat == PIXMAN_REPEAT_NONE) + { + if (*c < 0 || *c >= size) + return FALSE; + } + else if (repeat == PIXMAN_REPEAT_NORMAL) + { + while (*c >= size) + *c -= size; + while (*c < 0) + *c += size; + } + else if (repeat == PIXMAN_REPEAT_PAD) + { + *c = CLIP (*c, 0, size - 1); + } + else /* REFLECT */ + { + *c = MOD (*c, size * 2); + if (*c >= size) + *c = size * 2 - *c - 1; + } + return TRUE; +} + +static force_inline int +pixman_fixed_to_bilinear_weight (pixman_fixed_t x) +{ + return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & + ((1 << BILINEAR_INTERPOLATION_BITS) - 1); +} + +#if BILINEAR_INTERPOLATION_BITS <= 4 +/* Inspired by Filter_32_opaque from Skia */ +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + int distxy, distxiy, distixy, distixiy; + uint32_t lo, hi; + + distx <<= (4 - BILINEAR_INTERPOLATION_BITS); + disty <<= (4 - BILINEAR_INTERPOLATION_BITS); + + distxy = distx * disty; + distxiy = (distx << 4) - distxy; /* distx * (16 - disty) */ + distixy = (disty << 4) - distxy; /* disty * (16 - distx) */ + distixiy = + 16 * 16 - (disty << 4) - + (distx << 4) + distxy; /* (16 - distx) * (16 - disty) */ + + lo = (tl & 0xff00ff) * distixiy; + hi = ((tl >> 8) & 0xff00ff) * distixiy; + + lo += (tr & 0xff00ff) * distxiy; + hi += ((tr >> 8) & 0xff00ff) * distxiy; + + lo += (bl & 0xff00ff) * distixy; + hi += ((bl >> 8) & 0xff00ff) * distixy; + + lo += (br & 0xff00ff) * distxy; + hi += ((br >> 8) & 0xff00ff) * distxy; + + return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); +} + +#else +#if SIZEOF_LONG > 4 + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + uint64_t distxy, distxiy, distixy, distixiy; + uint64_t tl64, tr64, bl64, br64; + uint64_t f, r; + + distx <<= (8 - BILINEAR_INTERPOLATION_BITS); + disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + + distxy = distx * disty; + distxiy = distx * (256 - disty); + distixy = (256 - distx) * disty; + distixiy = (256 - distx) * (256 - disty); + + /* Alpha and Blue */ + tl64 = tl & 0xff0000ff; + tr64 = tr & 0xff0000ff; + bl64 = bl & 0xff0000ff; + br64 = br & 0xff0000ff; + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r = f & 0x0000ff0000ff0000ull; + + /* Red and Green */ + tl64 = tl; + tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); + + tr64 = tr; + tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull); + + bl64 = bl; + bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); + + br64 = br; + br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); + + f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; + r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); + + return (uint32_t)(r >> 16); +} + +#else + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, + uint32_t bl, uint32_t br, + int distx, int disty) +{ + int distxy, distxiy, distixy, distixiy; + uint32_t f, r; + + distx <<= (8 - BILINEAR_INTERPOLATION_BITS); + 
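/* Note: distx and disty are rescaled here to 8-bit range, so the four
 * weights computed next satisfy
 *   distxy + distxiy + distixy + distixiy = 256 * 256,
 * i.e. the blend is a true weighted average and cannot overshoot. */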
disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + + distxy = distx * disty; + distxiy = (distx << 8) - distxy; /* distx * (256 - disty) */ + distixy = (disty << 8) - distxy; /* disty * (256 - distx) */ + distixiy = + 256 * 256 - (disty << 8) - + (distx << 8) + distxy; /* (256 - distx) * (256 - disty) */ + + /* Blue */ + r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + + /* Green */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + tl >>= 16; + tr >>= 16; + bl >>= 16; + br >>= 16; + r >>= 16; + + /* Red */ + f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy + + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; + r |= f & 0x00ff0000; + + /* Alpha */ + f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy + + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy; + r |= f & 0xff000000; + + return r; +} + +#endif +#endif // BILINEAR_INTERPOLATION_BITS <= 4 + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + * is probably excessive in many cases. This particular function + * may need its own correctness test and performance tuning. + */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * width, + int32_t * left_pad, + int32_t * right_pad) +{ + int64_t max_vx = (int64_t) source_image_width << 16; + int64_t tmp; + if (vx < 0) + { + tmp = ((int64_t) unit_x - 1 - vx) / unit_x; + if (tmp > *width) + { + *left_pad = *width; + *width = 0; + } + else + { + *left_pad = (int32_t) tmp; + *width -= (int32_t) tmp; + } + } + else + { + *left_pad = 0; + } + tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; + if (tmp < 0) + { + *right_pad = *width; + *width = 0; + } + else if (tmp >= *width) + { + *right_pad = 0; + } + else + { + *right_pad = *width - (int32_t) tmp; + *width = (int32_t) tmp; + } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. + * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with + 565 source, but it is needed to build. 
*/ +#define GET_0565_ALPHA(s) 0xff +#define GET_x888_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ +static force_inline void \ +scanline_func_name (dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t src_width_fixed, \ + pixman_bool_t fully_transparent_src) \ +{ \ + uint32_t d; \ + src_type_t s1, s2; \ + uint8_t a1, a2; \ + int x1, x2; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \ + return; \ + \ + if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ + abort(); \ + \ + while ((w -= 2) >= 0) \ + { \ + x1 = pixman_fixed_to_int (vx); \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= 0) \ + vx -= src_width_fixed; \ + } \ + s1 = *(src + x1); \ + \ + x2 = pixman_fixed_to_int (vx); \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= 0) \ + vx -= src_width_fixed; \ + } \ + s2 = *(src + x2); \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ + \ + if (a1 == 0xff) \ + { \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = convert_ ## DST_FORMAT ## _to_8888 (*dst); \ + s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ + } \ + dst++; \ + \ + if (a2 == 0xff) \ + { \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2); \ + } \ + else if (s2) \ + { \ + d = convert_## DST_FORMAT ## _to_8888 (*dst); \ + s2 = convert_## SRC_FORMAT ## _to_8888 (s2); \ + a2 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2); \ + } \ + } \ + \ + if (w & 1) \ + { \ + x1 = pixman_fixed_to_int (vx); \ + s1 = *(src + x1); \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + \ + if (a1 == 0xff) \ + { \ + *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = convert_## DST_FORMAT ## _to_8888 (*dst); \ + s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = convert_8888_to_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1); \ + } \ + } \ +} + +#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ +static void \ +fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type_t *dst_line; \ + mask_type_t *mask_line; \ + src_type_t *src_first_line; \ + int y; \ + pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width); \ + pixman_fixed_t max_vy; \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, right_pad; \ + \ + 
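Stripped of the OVER handling, repeat modes and two-pixel unrolling, the core of the scanline template above is just fixed-point stepping through the source row. A standalone sketch with simplified names, assuming all samples land inside the source (the 'cover' case); the main-loop macro's declarations continue below:

#include <stdint.h>

typedef int32_t fixed_16_16_t;                 /* 16.16 fixed point */
#define FIXED_TO_INT(f) ((int) ((f) >> 16))

static void
scale_row_nearest (uint32_t *dst, const uint32_t *src, int width,
                   fixed_16_16_t vx, fixed_16_16_t unit_x)
{
    while (width--)
    {
        *dst++ = src[FIXED_TO_INT (vx)];       /* nearest source sample */
        vx += unit_x;                          /* advance in source space */
    }
}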
src_type_t *src; \ + dst_type_t *dst; \ + mask_type_t solid_mask; \ + const mask_type_t *mask = &solid_mask; \ + int src_stride, mask_stride, dst_stride; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ + if (have_mask) \ + { \ + if (mask_is_solid) \ + solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ + else \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ + mask_stride, mask_line, 1); \ + } \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ + v.vector[0] -= pixman_fixed_e; \ + v.vector[1] -= pixman_fixed_e; \ + \ + vx = v.vector[0]; \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + max_vy = pixman_int_to_fixed (src_image->bits.height); \ + \ + /* Clamp repeating positions inside the actual samples */ \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ + &width, &left_pad, &right_pad); \ + vx += left_pad * unit_x; \ + } \ + \ + while (--height >= 0) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + if (have_mask && !mask_is_solid) \ + { \ + mask = mask_line; \ + mask_line += mask_stride; \ + } \ + \ + y = pixman_fixed_to_int (vy); \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, \ + src + src_image->bits.width - src_image->bits.width + 1, \ + left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 
0 : left_pad + width), \ + dst + left_pad + width, src + src_image->bits.width, \ + right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + static const src_type_t zero[1] = { 0 }; \ + if (y < 0 || y >= src_image->bits.height) \ + { \ + scanline_func (mask, dst, zero + 1, left_pad + width + right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ + continue; \ + } \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, zero + 1, left_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, zero + 1, right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ + } \ + } \ + else \ + { \ + src = src_first_line + src_stride * y; \ + scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \ + unit_x, src_width_fixed, FALSE); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) + +#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + static force_inline void \ + scanline_func##scale_func_name##_wrapper ( \ + const uint8_t *mask, \ + dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ + { \ + scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src); \ + } \ + FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper, \ + src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE) + +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t, \ + dst_type_t, repeat_mode) + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ + FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ + OP, repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP, \ + scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | 
\ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, \ + PIXMAN_solid, 
MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) + +/*****************************************************************************/ + +/* + * Identify 5 zones in each scanline for bilinear scaling. Depending on + * whether 2 pixels to be interpolated are fetched from the image itself, + * from the padding area around it or from both image and padding area. + */ +static force_inline void +bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * left_pad, + int32_t * left_tz, + int32_t * width, + int32_t * right_tz, + int32_t * right_pad) +{ + int width1 = *width, left_pad1, right_pad1; + int width2 = *width, left_pad2, right_pad2; + + pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x, + &width1, &left_pad1, &right_pad1); + pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1, + unit_x, &width2, &left_pad2, &right_pad2); + + *left_pad = left_pad2; + *left_tz = left_pad1 - left_pad2; + *right_tz = right_pad2 - right_pad1; + *right_pad = right_pad1; + *width -= *left_pad + *left_tz + *right_tz + *right_pad; +} + +/* + * Main loop template for single pass bilinear scaling. It needs to be + * provided with 'scanline_func' which should do the compositing operation. + * The needed function has the following prototype: + * + * scanline_func (dst_type_t * dst, + * const mask_type_ * mask, + * const src_type_t * src_top, + * const src_type_t * src_bottom, + * int32_t width, + * int weight_top, + * int weight_bottom, + * pixman_fixed_t vx, + * pixman_fixed_t unit_x, + * pixman_fixed_t max_vx, + * pixman_bool_t zero_src) + * + * Where: + * dst - destination scanline buffer for storing results + * mask - mask buffer (or single value for solid mask) + * src_top, src_bottom - two source scanlines + * width - number of pixels to process + * weight_top - weight of the top row for interpolation + * weight_bottom - weight of the bottom row for interpolation + * vx - initial position for fetching the first pair of + * pixels from the source buffer + * unit_x - position increment needed to move to the next pair + * of pixels + * max_vx - image size as a fixed point value, can be used for + * implementing NORMAL repeat (when it is supported) + * zero_src - boolean hint variable, which is set to TRUE when + * all source pixels are fetched from zero padding + * zone for NONE repeat + * + * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to + * BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that + * for NONE repeat when handling fuzzy antialiased top or bottom image + * edges. 
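(For NONE repeat, rows that fall outside the source are simply given weight zero, and the reduced total weight is what fades the edge out.)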
Also both top and bottom weight variables are guaranteed to + * have value, which is less than BILINEAR_INTERPOLATION_RANGE. + * For example, the weights can fit into unsigned byte or be used + * with 8-bit SIMD multiplication instructions for 8-bit interpolation + * precision. + */ +#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, flags) \ +static void \ +fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ + pixman_composite_info_t *info) \ +{ \ + PIXMAN_COMPOSITE_ARGS (info); \ + dst_type_t *dst_line; \ + mask_type_t *mask_line; \ + src_type_t *src_first_line; \ + int y1, y2; \ + pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, left_tz, right_tz, right_pad; \ + \ + dst_type_t *dst; \ + mask_type_t solid_mask; \ + const mask_type_t *mask = &solid_mask; \ + int src_stride, mask_stride, dst_stride; \ + \ + int src_width; \ + pixman_fixed_t src_width_fixed; \ + int max_x; \ + pixman_bool_t need_src_extension; \ + \ + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ + if (flags & FLAG_HAVE_SOLID_MASK) \ + { \ + solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ + mask_stride = 0; \ + } \ + else if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + { \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ + mask_stride, mask_line, 1); \ + } \ + \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + v.vector[0] -= pixman_fixed_1 / 2; \ + v.vector[1] -= pixman_fixed_1 / 2; \ + \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x, \ + &left_pad, &left_tz, &width, &right_tz, &right_pad); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + /* PAD repeat does not need special handling for 'transition zones' and */ \ + /* they can be combined with 'padding zones' safely */ \ + left_pad += left_tz; \ + right_pad += right_tz; \ + left_tz = right_tz = 0; \ + } \ + v.vector[0] += left_pad * unit_x; \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + vx = v.vector[0]; \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width)); \ + max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1; \ + \ + if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH) \ + { \ + src_width = 0; \ + \ + while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x) \ + src_width += src_image->bits.width; \ + \ + need_src_extension = TRUE; \ + } \ + else \ + { \ + src_width = src_image->bits.width; \ + need_src_extension = FALSE; \ + } \ + \ + 
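A standalone sketch of the source-line extension being set up here: a narrow NORMAL-repeat source is replicated whole until it reaches REPEAT_NORMAL_MIN_WIDTH, so the inner loop gets long runs between wrap-arounds (simplified: the max_x early-out above is ignored, and names are illustrative):

#include <stdint.h>

#define REPEAT_NORMAL_MIN_WIDTH 64

/* Caller guarantees 0 < src_width < REPEAT_NORMAL_MIN_WIDTH.
 * Returns the extended width, always a whole multiple of src_width. */
static int
extend_source_line (const uint32_t *src, int src_width,
                    uint32_t dst[2 * REPEAT_NORMAL_MIN_WIDTH])
{
    int i = 0, j;

    while (i < REPEAT_NORMAL_MIN_WIDTH)
    {
        for (j = 0; j < src_width; j++)
            dst[i++] = src[j];
    }

    return i;
}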
src_width_fixed = pixman_int_to_fixed (src_width); \ + } \ + \ + while (--height >= 0) \ + { \ + int weight1, weight2; \ + dst = dst_line; \ + dst_line += dst_stride; \ + vx = v.vector[0]; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + { \ + mask = mask_line; \ + mask_line += mask_stride; \ + } \ + \ + y1 = pixman_fixed_to_int (vy); \ + weight2 = pixman_fixed_to_bilinear_weight (vy); \ + if (weight2) \ + { \ + /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */ \ + y2 = y1 + 1; \ + weight1 = BILINEAR_INTERPOLATION_RANGE - weight2; \ + } \ + else \ + { \ + /* set both top and bottom row to the same scanline and tweak weights */ \ + y2 = y1; \ + weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2; \ + } \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + src_type_t *src1, *src2; \ + src_type_t buf1[2]; \ + src_type_t buf2[2]; \ + repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height); \ + repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ + src1 = src_first_line + src_stride * y1; \ + src2 = src_first_line + src_stride * y2; \ + \ + if (left_pad > 0) \ + { \ + buf1[0] = buf1[1] = src1[0]; \ + buf2[0] = buf2[1] = src2[0]; \ + scanline_func (dst, mask, \ + buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ + dst += left_pad; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += left_pad; \ + } \ + if (width > 0) \ + { \ + scanline_func (dst, mask, \ + src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ + dst += width; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += width; \ + } \ + if (right_pad > 0) \ + { \ + buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ + buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ + scanline_func (dst, mask, \ + buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + src_type_t *src1, *src2; \ + src_type_t buf1[2]; \ + src_type_t buf2[2]; \ + /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ + if (y1 < 0) \ + { \ + weight1 = 0; \ + y1 = 0; \ + } \ + if (y1 >= src_image->bits.height) \ + { \ + weight1 = 0; \ + y1 = src_image->bits.height - 1; \ + } \ + if (y2 < 0) \ + { \ + weight2 = 0; \ + y2 = 0; \ + } \ + if (y2 >= src_image->bits.height) \ + { \ + weight2 = 0; \ + y2 = src_image->bits.height - 1; \ + } \ + src1 = src_first_line + src_stride * y1; \ + src2 = src_first_line + src_stride * y2; \ + \ + if (left_pad > 0) \ + { \ + buf1[0] = buf1[1] = 0; \ + buf2[0] = buf2[1] = 0; \ + scanline_func (dst, mask, \ + buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ + dst += left_pad; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += left_pad; \ + } \ + if (left_tz > 0) \ + { \ + buf1[0] = 0; \ + buf1[1] = src1[0]; \ + buf2[0] = 0; \ + buf2[1] = src2[0]; \ + scanline_func (dst, mask, \ + buf1, buf2, left_tz, weight1, weight2, \ + pixman_fixed_frac (vx), unit_x, 0, FALSE); \ + dst += left_tz; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += left_tz; \ + vx += left_tz * unit_x; \ + } \ + if (width > 0) \ + { \ + scanline_func (dst, mask, \ + src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ + dst += width; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += width; \ + vx += width * unit_x; \ + } \ + if (right_tz > 0) \ + { \ + buf1[0] = src1[src_image->bits.width - 1]; \ + buf1[1] = 0; \ + buf2[0] = src2[src_image->bits.width - 1]; \ + buf2[1] = 0; \ + scanline_func (dst, mask, \ + buf1, buf2, right_tz, weight1, weight2, \ 
+ pixman_fixed_frac (vx), unit_x, 0, FALSE); \ + dst += right_tz; \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += right_tz; \ + } \ + if (right_pad > 0) \ + { \ + buf1[0] = buf1[1] = 0; \ + buf2[0] = buf2[1] = 0; \ + scanline_func (dst, mask, \ + buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + int32_t num_pixels; \ + int32_t width_remain; \ + src_type_t * src_line_top; \ + src_type_t * src_line_bottom; \ + src_type_t buf1[2]; \ + src_type_t buf2[2]; \ + src_type_t extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2]; \ + src_type_t extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2]; \ + int i, j; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height); \ + repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height); \ + src_line_top = src_first_line + src_stride * y1; \ + src_line_bottom = src_first_line + src_stride * y2; \ + \ + if (need_src_extension) \ + { \ + for (i=0; i<src_width;) \ + { \ + for (j=0; j<src_image->bits.width; j++, i++) \ + { \ + extended_src_line0[i] = src_line_top[j]; \ + extended_src_line1[i] = src_line_bottom[j]; \ + } \ + } \ + \ + src_line_top = &extended_src_line0[0]; \ + src_line_bottom = &extended_src_line1[0]; \ + } \ + \ + /* Top & Bottom wrap around buffer */ \ + buf1[0] = src_line_top[src_width - 1]; \ + buf1[1] = src_line_top[0]; \ + buf2[0] = src_line_bottom[src_width - 1]; \ + buf2[1] = src_line_bottom[0]; \ + \ + width_remain = width; \ + \ + while (width_remain > 0) \ + { \ + /* We use src_width_fixed because it can make vx in original source range */ \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \ + \ + /* Wrap around part */ \ + if (pixman_fixed_to_int (vx) == src_width - 1) \ + { \ + /* for positive unit_x \ + * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed \ + * \ + * vx is in range [0, src_width_fixed - pixman_fixed_e] \ + * So we are safe from overflow. \ + */ \ + num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1; \ + \ + if (num_pixels > width_remain) \ + num_pixels = width_remain; \ + \ + scanline_func (dst, mask, buf1, buf2, num_pixels, \ + weight1, weight2, pixman_fixed_frac(vx), \ + unit_x, src_width_fixed, FALSE); \ + \ + width_remain -= num_pixels; \ + vx += num_pixels * unit_x; \ + dst += num_pixels; \ + \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += num_pixels; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \ + } \ + \ + /* Normal scanline composite */ \ + if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0) \ + { \ + /* for positive unit_x \ + * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1) \ + * \ + * vx is in range [0, src_width_fixed - pixman_fixed_e] \ + * So we are safe from overflow here.
\ + */ \ + num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e) \ + / unit_x) + 1; \ + \ + if (num_pixels > width_remain) \ + num_pixels = width_remain; \ + \ + scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels, \ + weight1, weight2, vx, unit_x, src_width_fixed, FALSE); \ + \ + width_remain -= num_pixels; \ + vx += num_pixels * unit_x; \ + dst += num_pixels; \ + \ + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ + mask += num_pixels; \ + } \ + } \ + } \ + else \ + { \ + scanline_func (dst, mask, src_first_line + src_stride * y1, \ + src_first_line + src_stride * y2, width, \ + weight1, weight2, vx, unit_x, max_vx, FALSE); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, flags) \ + FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ + dst_type_t, repeat_mode, flags) + +#define SCALED_BILINEAR_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_BILINEAR_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR, \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, 
\ + } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR, \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_BILINEAR_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func) \ + SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func) + +#endif diff --git a/programs/develop/libraries/pixman/pixman-linear-gradient.c b/programs/develop/libraries/pixman/pixman-linear-gradient.c index b048e7b065..40c8c9f37d 100644 --- a/programs/develop/libraries/pixman/pixman-linear-gradient.c +++ b/programs/develop/libraries/pixman/pixman-linear-gradient.c @@ -31,36 +31,32 @@ #include <stdlib.h> #include "pixman-private.h" -static source_image_class_t -linear_gradient_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) +static pixman_bool_t +linear_gradient_is_horizontal (pixman_image_t *image, + int x, + int y, + int width, + int height) { - source_image_t *source = (source_image_t *)image; linear_gradient_t *linear = (linear_gradient_t *)image; pixman_vector_t v; pixman_fixed_32_32_t l;
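For context on how the SIMPLE_* entry macros above get used: an implementation hands a table of such entries, terminated by PIXMAN_OP_NONE, to _pixman_implementation_create, and _pixman_implementation_lookup_composite scans it. A sketch, assuming the scanline workers for a hypothetical example_8888_8888 scaler were generated with the FAST_BILINEAR_MAINLOOP_COMMON template (the gradient helper's declarations continue below):

static const pixman_fast_path_t example_fast_paths[] =
{
    /* expands to the cover/none/pad/normal variants */
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, example_8888_8888),
    { PIXMAN_OP_NONE },
};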
pixman_fixed_48_16_t dx, dy; double inc; - source_image_class_t class; - class = SOURCE_IMAGE_CLASS_UNKNOWN; - - if (source->common.transform) + if (image->common.transform) { /* projective transformation */ - if (source->common.transform->matrix[2][0] != 0 || - source->common.transform->matrix[2][1] != 0 || - source->common.transform->matrix[2][2] == 0) + if (image->common.transform->matrix[2][0] != 0 || + image->common.transform->matrix[2][1] != 0 || + image->common.transform->matrix[2][2] == 0) { - return class; + return FALSE; } - v.vector[0] = source->common.transform->matrix[0][1]; - v.vector[1] = source->common.transform->matrix[1][1]; - v.vector[2] = source->common.transform->matrix[2][2]; + v.vector[0] = image->common.transform->matrix[0][1]; + v.vector[1] = image->common.transform->matrix[1][1]; + v.vector[2] = image->common.transform->matrix[2][2]; } else { @@ -75,7 +71,7 @@ linear_gradient_classify (pixman_image_t *image, l = dx * dx + dy * dy; if (l == 0) - return class; + return FALSE; /* * compute how much the input of the gradient walked changes @@ -87,43 +83,44 @@ linear_gradient_classify (pixman_image_t *image, /* check that casting to integer would result in 0 */ if (-1 < inc && inc < 1) - class = SOURCE_IMAGE_CLASS_HORIZONTAL; + return TRUE; - return class; + return FALSE; } -static void -linear_gradient_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static uint32_t * +linear_get_scanline_narrow (pixman_iter_t *iter, + const uint32_t *mask) { + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t * buffer = iter->buffer; + pixman_vector_t v, unit; pixman_fixed_32_32_t l; pixman_fixed_48_16_t dx, dy; gradient_t *gradient = (gradient_t *)image; - source_image_t *source = (source_image_t *)image; linear_gradient_t *linear = (linear_gradient_t *)image; uint32_t *end = buffer + width; pixman_gradient_walker_t walker; - _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); /* reference point is the center of the pixel */ v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; - if (source->common.transform) + if (image->common.transform) { - if (!pixman_transform_point_3d (source->common.transform, &v)) - return; - - unit.vector[0] = source->common.transform->matrix[0][0]; - unit.vector[1] = source->common.transform->matrix[1][0]; - unit.vector[2] = source->common.transform->matrix[2][0]; + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; + + unit.vector[0] = image->common.transform->matrix[0][0]; + unit.vector[1] = image->common.transform->matrix[1][0]; + unit.vector[2] = image->common.transform->matrix[2][0]; } else { @@ -219,18 +216,48 @@ linear_gradient_get_scanline_32 (pixman_image_t *image, v.vector[2] += unit.vector[2]; } } + + iter->y++; + + return iter->buffer; } -static void -linear_gradient_property_changed (pixman_image_t *image) +static uint32_t * +linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) { - image->common.get_scanline_32 = linear_gradient_get_scanline_32; - image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; + uint32_t *buffer = linear_get_scanline_narrow (iter, NULL); + + pixman_expand_to_float ( + (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return 
buffer; +} + +void +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (linear_gradient_is_horizontal ( + iter->image, iter->x, iter->y, iter->width, iter->height)) + { + if (iter->iter_flags & ITER_NARROW) + linear_get_scanline_narrow (iter, NULL); + else + linear_get_scanline_wide (iter, NULL); + + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else + { + if (iter->iter_flags & ITER_NARROW) + iter->get_scanline = linear_get_scanline_narrow; + else + iter->get_scanline = linear_get_scanline_wide; + } } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_linear_gradient (pixman_point_fixed_t * p1, - pixman_point_fixed_t * p2, +pixman_image_create_linear_gradient (const pixman_point_fixed_t * p1, + const pixman_point_fixed_t * p2, const pixman_gradient_stop_t *stops, int n_stops) { @@ -254,8 +281,6 @@ pixman_image_create_linear_gradient (pixman_point_fixed_t * p1, linear->p2 = *p2; image->type = LINEAR; - image->common.classify = linear_gradient_classify; - image->common.property_changed = linear_gradient_property_changed; return image; } diff --git a/programs/develop/libraries/pixman/pixman-matrix.c b/programs/develop/libraries/pixman/pixman-matrix.c index abdfa05258..89b96826b8 100644 --- a/programs/develop/libraries/pixman/pixman-matrix.c +++ b/programs/develop/libraries/pixman/pixman-matrix.c @@ -25,7 +25,7 @@ */ #ifdef HAVE_CONFIG_H -#include "config.h" +#include <config.h> #endif #include <math.h> @@ -34,6 +34,338 @@ #define F(x) pixman_int_to_fixed (x) +static force_inline int +count_leading_zeros (uint32_t x) +{ +#ifdef __GNUC__ + return __builtin_clz (x); +#else + int n = 0; + while (x) + { + n++; + x >>= 1; + } + return 32 - n; +#endif +} + +/* + * Large signed/unsigned integer division with rounding for the platforms with + * only 64-bit integer data type supported (no 128-bit data type).
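(The dividend is consumed 16 bits at a time, in effect grade-school long division in base 65536.)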
+ * + * Arguments: + * hi, lo - high and low 64-bit parts of the dividend + * div - 48-bit divisor + * + * Returns: lowest 64 bits of the result as a return value and highest 64 + * bits of the result to "result_hi" pointer + */ + +/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */ +static force_inline uint64_t +rounded_udiv_128_by_48 (uint64_t hi, + uint64_t lo, + uint64_t div, + uint64_t *result_hi) +{ + uint64_t tmp, remainder, result_lo; + assert(div < ((uint64_t)1 << 48)); + + remainder = hi % div; + *result_hi = hi / div; + + tmp = (remainder << 16) + (lo >> 48); + result_lo = tmp / div; + remainder = tmp % div; + + tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + tmp = (remainder << 16) + (lo & 0xFFFF); + result_lo = (result_lo << 16) + (tmp / div); + remainder = tmp % div; + + /* round to nearest */ + if (remainder * 2 >= div && ++result_lo == 0) + *result_hi += 1; + + return result_lo; +} + +/* signed division (128-bit by 49-bit) with rounding to nearest */ +static inline int64_t +rounded_sdiv_128_by_49 (int64_t hi, + uint64_t lo, + int64_t div, + int64_t *signed_result_hi) +{ + uint64_t result_lo, result_hi; + int sign = 0; + if (div < 0) + { + div = -div; + sign ^= 1; + } + if (hi < 0) + { + if (lo != 0) + hi++; + hi = -hi; + lo = -lo; + sign ^= 1; + } + result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi); + if (sign) + { + if (result_lo != 0) + result_hi++; + result_hi = -result_hi; + result_lo = -result_lo; + } + if (signed_result_hi) + { + *signed_result_hi = result_hi; + } + return result_lo; +} + +/* + * Multiply 64.16 fixed point value by (2^scalebits) and convert + * to 128-bit integer. + */ +static force_inline void +fixed_64_16_to_int128 (int64_t hi, + int64_t lo, + int64_t *rhi, + int64_t *rlo, + int scalebits) +{ + /* separate integer and fractional parts */ + hi += lo >> 16; + lo &= 0xFFFF; + + if (scalebits <= 0) + { + *rlo = hi >> (-scalebits); + *rhi = *rlo >> 63; + } + else + { + *rhi = hi >> (64 - scalebits); + *rlo = (uint64_t)hi << scalebits; + if (scalebits < 16) + *rlo += lo >> (16 - scalebits); + else + *rlo += lo << (scalebits - 16); + } +} + +/* + * Convert 112.16 fixed point value to 48.16 with clamping for the out + * of range values. + */ +static force_inline pixman_fixed_48_16_t +fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag) +{ + if ((lo >> 63) != hi) + { + *clampflag = TRUE; + return hi >= 0 ? INT64_MAX : INT64_MIN; + } + else + { + return lo; + } +} + +/* + * Transform a point with 31.16 fixed point coordinates from the destination + * space to a point with 48.16 fixed point coordinates in the source space. + * No overflows are possible for affine transformations and the results are + * accurate including the least significant bit. Projective transformations + * may overflow, in this case the results are just clamped to return maximum + * or minimum 48.16 values (so that the caller can at least handle the NONE + * and PAD repeats correctly) and the return value is FALSE to indicate that + * such clamping has happened. 
+ */ +PIXMAN_EXPORT pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + pixman_bool_t clampflag = FALSE; + int i; + int64_t tmp[3][2], divint; + uint16_t divfrac; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + for (i = 0; i < 3; i++) + { + tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); + tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); + } + + /* + * separate 64-bit integer and 16-bit fractional parts for the divisor, + * which is also scaled by 65536 after fixed point multiplication. + */ + divint = tmp[2][0] + (tmp[2][1] >> 16); + divfrac = tmp[2][1] & 0xFFFF; + + if (divint == pixman_fixed_1 && divfrac == 0) + { + /* + * this is a simple affine transformation + */ + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + result->v[2] = pixman_fixed_1; + } + else if (divint == 0 && divfrac == 0) + { + /* + * handle zero divisor (if the values are non-zero, set the + * results to maximum positive or minimum negative) + */ + clampflag = TRUE; + + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + + if (result->v[0] > 0) + result->v[0] = INT64_MAX; + else if (result->v[0] < 0) + result->v[0] = INT64_MIN; + + if (result->v[1] > 0) + result->v[1] = INT64_MAX; + else if (result->v[1] < 0) + result->v[1] = INT64_MIN; + } + else + { + /* + * projective transformation, analyze the top 32 bits of the divisor + */ + int32_t hi32divbits = divint >> 32; + if (hi32divbits < 0) + hi32divbits = ~hi32divbits; + + if (hi32divbits == 0) + { + /* the divisor is small, we can actually keep all the bits */ + int64_t hi, rhi, lo, rlo; + int64_t div = (divint << 16) + divfrac; + + fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + + fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + } + else + { + /* the divisor needs to be reduced to 48 bits */ + int64_t hi, rhi, lo, rlo, div; + int shift = 32 - count_leading_zeros (hi32divbits); + fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift); + + fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + + fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift); + rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); + result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + } + } + result->v[2] = pixman_fixed_1; + return !clampflag; +} + +PIXMAN_EXPORT 
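A minimal sketch of the affine case described above, mirroring the pixman_transform_point_31_16_affine variant that follows: split each 48.16 coordinate into integer and fractional halves so the 64-bit products cannot overflow, then recombine with rounding (standalone types, illustrative names):

#include <stdint.h>

typedef int32_t fixed_16_16_t;   /* matrix entries, 16.16 */
typedef int64_t fixed_48_16_t;   /* coordinates, 48.16 */

static fixed_48_16_t
affine_row (const fixed_16_16_t row[3], fixed_48_16_t x, fixed_48_16_t y)
{
    int64_t hi, lo;

    hi  = (int64_t) row[0] * (x >> 16);      /* integer parts */
    lo  = (int64_t) row[0] * (x & 0xFFFF);   /* fractional parts */
    hi += (int64_t) row[1] * (y >> 16);
    lo += (int64_t) row[1] * (y & 0xFFFF);
    hi += row[2];                            /* translation, already 16.16 */

    return hi + ((lo + 0x8000) >> 16);       /* round to nearest 48.16 */
}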
void +pixman_transform_point_31_16_affine (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + int64_t hi0, lo0, hi1, lo1; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + hi0 = (int64_t)t->matrix[0][0] * (v->v[0] >> 16); + lo0 = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF); + hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16); + lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF); + hi0 += (int64_t)t->matrix[0][2]; + + hi1 = (int64_t)t->matrix[1][0] * (v->v[0] >> 16); + lo1 = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF); + hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16); + lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF); + hi1 += (int64_t)t->matrix[1][2]; + + result->v[0] = hi0 + ((lo0 + 0x8000) >> 16); + result->v[1] = hi1 + ((lo1 + 0x8000) >> 16); + result->v[2] = pixman_fixed_1; +} + +PIXMAN_EXPORT void +pixman_transform_point_31_16_3d (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result) +{ + int i; + int64_t tmp[3][2]; + + /* input vector values must have no more than 31 bits (including sign) + * in the integer part */ + assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); + assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + + for (i = 0; i < 3; i++) + { + tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); + tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); + tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); + tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); + } + + result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); + result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16); +} + PIXMAN_EXPORT void pixman_transform_init_identity (struct pixman_transform *matrix) { @@ -50,69 +382,41 @@ PIXMAN_EXPORT pixman_bool_t pixman_transform_point_3d (const struct pixman_transform *transform, struct pixman_vector * vector) { - struct pixman_vector result; - pixman_fixed_32_32_t partial; - pixman_fixed_48_16_t v; - int i, j; + pixman_vector_48_16_t tmp; + tmp.v[0] = vector->vector[0]; + tmp.v[1] = vector->vector[1]; + tmp.v[2] = vector->vector[2]; - for (j = 0; j < 3; j++) - { - v = 0; - for (i = 0; i < 3; i++) - { - partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] * - (pixman_fixed_48_16_t) vector->vector[i]); - v += partial >> 16; - } - - if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) - return FALSE; - - result.vector[j] = (pixman_fixed_t) v; - } - - *vector = result; + pixman_transform_point_31_16_3d (transform, &tmp, &tmp); - if (!result.vector[2]) - return FALSE; + vector->vector[0] = tmp.v[0]; + vector->vector[1] = tmp.v[1]; + vector->vector[2] = tmp.v[2]; - return TRUE; + return vector->vector[0] == tmp.v[0] && + vector->vector[1] == tmp.v[1] && + vector->vector[2] == tmp.v[2]; } PIXMAN_EXPORT 
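/*
 * Note how the wrappers here detect overflow: pixman_transform_point_3d
 * above and pixman_transform_point below compute in 48.16 precision,
 * store the results back into the caller's 16.16 pixman_fixed_t vector,
 * and then compare the narrowed values against the wide ones; any
 * mismatch means a result did not fit and FALSE is returned. A minimal
 * sketch of that round-trip check (hypothetical helper, not part of
 * pixman):
 *
 *     static pixman_bool_t
 *     narrow_48_16 (pixman_fixed_48_16_t wide, pixman_fixed_t *out)
 *     {
 *         *out = (pixman_fixed_t) wide;  // truncating store into 32 bits
 *         return *out == wide;           // TRUE only if the value fit
 *     }
 */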
pixman_bool_t pixman_transform_point (const struct pixman_transform *transform,
 struct pixman_vector * vector)
{
- pixman_fixed_32_32_t partial;
- pixman_fixed_34_30_t v[3];
- pixman_fixed_48_16_t quo;
- int i, j;
+ pixman_vector_48_16_t tmp;
+ tmp.v[0] = vector->vector[0];
+ tmp.v[1] = vector->vector[1];
+ tmp.v[2] = vector->vector[2];
- for (j = 0; j < 3; j++)
- {
- v[j] = 0;
-
- for (i = 0; i < 3; i++)
- {
- partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
- (pixman_fixed_32_32_t) vector->vector[i]);
- v[j] += partial >> 2;
- }
- }
-
- if (!(v[2] >> 16))
- return FALSE;
+ if (!pixman_transform_point_31_16 (transform, &tmp, &tmp))
+ return FALSE;
- for (j = 0; j < 2; j++)
- {
- quo = v[j] / (v[2] >> 16);
- if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
- return FALSE;
- vector->vector[j] = (pixman_fixed_t) quo;
- }
-
- vector->vector[2] = pixman_fixed_1;
- return TRUE;
+ vector->vector[0] = tmp.v[0];
+ vector->vector[1] = tmp.v[1];
+ vector->vector[2] = tmp.v[2];
+
+ return vector->vector[0] == tmp.v[0] &&
+ vector->vector[1] == tmp.v[1] &&
+ vector->vector[2] == tmp.v[2];
}
 PIXMAN_EXPORT pixman_bool_t
@@ -138,7 +442,7 @@ pixman_transform_multiply (struct pixman_transform * dst,
 (pixman_fixed_32_32_t) l->matrix[dy][o] *
 (pixman_fixed_32_32_t) r->matrix[o][dx];
- v += partial >> 16;
+ v += (partial + 0x8000) >> 16;
 }
 if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
@@ -336,14 +640,14 @@ PIXMAN_EXPORT pixman_bool_t
 pixman_transform_invert (struct pixman_transform * dst,
 const struct pixman_transform *src)
 {
- struct pixman_f_transform m, r;
+ struct pixman_f_transform m;
 pixman_f_transform_from_pixman_transform (&m, src);
- if (!pixman_f_transform_invert (&r, &m))
+ if (!pixman_f_transform_invert (&m, &m))
 return FALSE;
- if (!pixman_transform_from_pixman_f_transform (dst, &r))
+ if (!pixman_transform_from_pixman_f_transform (dst, &m))
 return FALSE;
 return TRUE;
@@ -425,7 +729,8 @@ pixman_transform_is_inverse (const struct pixman_transform *a,
 {
 struct pixman_transform t;
- pixman_transform_multiply (&t, a, b);
+ if (!pixman_transform_multiply (&t, a, b))
+ return FALSE;
 return pixman_transform_is_identity (&t);
 }
@@ -464,17 +769,15 @@ pixman_transform_from_pixman_f_transform (struct pixman_transform * t,
 return TRUE;
 }
-static const int a[3] = { 3, 3, 2 };
-static const int b[3] = { 2, 1, 1 };
-
 PIXMAN_EXPORT pixman_bool_t
 pixman_f_transform_invert (struct pixman_f_transform * dst,
 const struct pixman_f_transform *src)
 {
+ static const int a[3] = { 2, 2, 1 };
+ static const int b[3] = { 1, 0, 0 };
+
 pixman_f_transform_t d;
 double det;
 int i, j;
- static int a[3] = { 2, 2, 1 };
- static int b[3] = { 1, 0, 0 };
 det = 0;
 for (i = 0; i < 3; i++)
@@ -509,10 +812,12 @@ pixman_f_transform_invert (struct pixman_f_transform * dst,
 if (((i + j) & 1) != 0)
 p = -p;
- dst->m[j][i] = det * p;
+ d.m[j][i] = det * p;
 }
 }
+ *dst = d;
+
 return TRUE;
}
diff --git a/programs/develop/libraries/pixman/pixman-mmx.c b/programs/develop/libraries/pixman/pixman-mmx.c
index 34637a4fe3..14790c029f 100644
--- a/programs/develop/libraries/pixman/pixman-mmx.c
+++ b/programs/develop/libraries/pixman/pixman-mmx.c
@@ -33,13 +33,16 @@
 #include <config.h>
 #endif
-#ifdef USE_MMX
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
+#ifdef USE_LOONGSON_MMI
+#include <loongson-mmintrin.h>
+#else
+#include <mmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-
-#define noVERBOSE
+#include "pixman-inlines.h"
 #ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__,
__LINE__)
@@ -47,6 +50,79 @@
 #define CHECKPOINT()
 #endif
+#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
+/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+#endif
+
+#ifdef USE_X86_MMX
+# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
+# include <xmmintrin.h>
+# else
+/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
+ * instructions to be generated that we don't want. Just duplicate the
+ * functions we want to use. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+ int ret;
+
+ asm ("pmovmskb %1, %0\n\t"
+ : "=r" (ret)
+ : "y" (__A)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+ asm ("pmulhuw %1, %0\n\t"
+ : "+y" (__A)
+ : "y" (__B)
+ );
+ return __A;
+}
+
+# ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
+{
+ __m64 ret;
+
+ asm ("pshufw %2, %1, %0\n\t"
+ : "=y" (ret)
+ : "y" (__A), "K" (__N)
+ );
+
+ return ret;
+}
+# else
+# define _mm_shuffle_pi16(A, N) \
+ ({ \
+ __m64 ret; \
+ \
+ asm ("pshufw %2, %1, %0\n\t" \
+ : "=y" (ret) \
+ : "y" (A), "K" ((const int8_t)N) \
+ ); \
+ \
+ ret; \
+ })
+# endif
+# endif
+#endif
+
+#ifndef _MSC_VER
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+#endif
+
 /* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
@@ -68,17 +144,41 @@
 /* --------------- MMX primitives ------------------------------------- */
-#ifdef __GNUC__
+/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
+ * the name of the member used to access the data.
+ * If __m64 requires using mm_cvt* intrinsics functions to convert between
+ * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
+ * If __m64 and uint64_t values can just be cast to each other directly,
+ * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
+ */
+#ifdef _MSC_VER
+# define M64_MEMBER m64_u64
+#elif defined(__ICC)
+# define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_MMI)
+# define USE_M64_DOUBLE
+#elif defined(__GNUC__)
+# define USE_M64_CASTS
+#elif defined(__SUNPRO_C)
+# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
+/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
+ * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
+ * is defined. If it is used, then the mm_cvt* intrinsics must be used.
+ */
+# define USE_CVT_INTRINSICS
+# else
+/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
+ * disabled, __m64 is defined as a struct containing "unsigned long long l_".
+ */ +# define M64_MEMBER l_ +# endif +#endif + +#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) typedef uint64_t mmxdatafield; #else typedef __m64 mmxdatafield; -/* If __m64 is defined as a struct or union, define M64_MEMBER to be the - name of the member used to access the data */ -# ifdef _MSC_VER -# define M64_MEMBER m64_u64 -# elif defined(__SUNPRO_C) -# define M64_MEMBER l_ -# endif #endif typedef struct @@ -87,24 +187,31 @@ typedef struct mmxdatafield mmx_4x0080; mmxdatafield mmx_565_rgb; mmxdatafield mmx_565_unpack_multiplier; + mmxdatafield mmx_565_pack_multiplier; mmxdatafield mmx_565_r; mmxdatafield mmx_565_g; mmxdatafield mmx_565_b; + mmxdatafield mmx_packed_565_rb; + mmxdatafield mmx_packed_565_g; + mmxdatafield mmx_expand_565_g; + mmxdatafield mmx_expand_565_b; + mmxdatafield mmx_expand_565_r; +#ifndef USE_LOONGSON_MMI mmxdatafield mmx_mask_0; mmxdatafield mmx_mask_1; mmxdatafield mmx_mask_2; mmxdatafield mmx_mask_3; +#endif mmxdatafield mmx_full_alpha; - mmxdatafield mmx_ffff0000ffff0000; - mmxdatafield mmx_0000ffff00000000; - mmxdatafield mmx_000000000000ffff; + mmxdatafield mmx_4x0101; + mmxdatafield mmx_ff000000; } mmx_data_t; #if defined(_MSC_VER) # define MMXDATA_INIT(field, val) { val ## UI64 } #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ # define MMXDATA_INIT(field, val) field = { val ## ULL } -#else /* __m64 is an integral type */ +#else /* mmxdatafield is an integral type */ # define MMXDATA_INIT(field, val) field = val ## ULL #endif @@ -114,25 +221,32 @@ static const mmx_data_t c = MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), + MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), + MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), + MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), + MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), + MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), + MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), +#ifndef USE_LOONGSON_MMI MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), +#endif MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), - MMXDATA_INIT (.mmx_ffff0000ffff0000, 0xffff0000ffff0000), - MMXDATA_INIT (.mmx_0000ffff00000000, 0x0000ffff00000000), - MMXDATA_INIT (.mmx_000000000000ffff, 0x000000000000ffff), + MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), + MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000), }; -#ifdef __GNUC__ -# ifdef __ICC -# define MC(x) to_m64 (c.mmx_ ## x) -# else -# define MC(x) ((__m64)c.mmx_ ## x) -# endif +#ifdef USE_CVT_INTRINSICS +# define MC(x) to_m64 (c.mmx_ ## x) +#elif defined(USE_M64_CASTS) +# define MC(x) ((__m64)c.mmx_ ## x) +#elif defined(USE_M64_DOUBLE) +# define MC(x) (*(__m64 *)&c.mmx_ ## x) #else # define MC(x) c.mmx_ ## x #endif @@ -140,14 +254,16 @@ static const mmx_data_t c = static force_inline __m64 to_m64 (uint64_t x) { -#ifdef __ICC +#ifdef USE_CVT_INTRINSICS return _mm_cvtsi64_m64 (x); #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ __m64 res; res.M64_MEMBER = x; return res; -#else /* __m64 is an integral type */ +#elif defined USE_M64_DOUBLE + return *(__m64 *)&x; +#else /* 
USE_M64_CASTS */ return (__m64)x; #endif } @@ -155,12 +271,14 @@ to_m64 (uint64_t x) static force_inline uint64_t to_uint64 (__m64 x) { -#ifdef __ICC +#ifdef USE_CVT_INTRINSICS return _mm_cvtm64_si64 (x); #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ uint64_t res = x.M64_MEMBER; return res; -#else /* __m64 is an integral type */ +#elif defined USE_M64_DOUBLE + return *(uint64_t *)&x; +#else /* USE_M64_CASTS */ return (uint64_t)x; #endif } @@ -190,8 +308,7 @@ pix_multiply (__m64 a, __m64 b) res = _mm_mullo_pi16 (a, b); res = _mm_adds_pu16 (res, MC (4x0080)); - res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8)); - res = _mm_srli_pi16 (res, 8); + res = _mm_mulhi_pu16 (res, MC (4x0101)); return res; } @@ -205,52 +322,19 @@ pix_add (__m64 a, __m64 b) static force_inline __m64 expand_alpha (__m64 pixel) { - __m64 t1, t2; - - t1 = shift (pixel, -48); - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline __m64 expand_alpha_rev (__m64 pixel) { - __m64 t1, t2; - - /* move alpha to low 16 bits and zero the rest */ - t1 = shift (pixel, 48); - t1 = shift (t1, -48); - - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 invert_colors (__m64 pixel) { - __m64 x, y, z; - - x = y = z = pixel; - - x = _mm_and_si64 (x, MC (ffff0000ffff0000)); - y = _mm_and_si64 (y, MC (000000000000ffff)); - z = _mm_and_si64 (z, MC (0000ffff00000000)); - - y = shift (y, 32); - z = shift (z, -32); - - x = _mm_or_si64 (x, y); - x = _mm_or_si64 (x, z); - - return x; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline __m64 @@ -276,14 +360,6 @@ in (__m64 src, __m64 mask) return pix_multiply (src, mask); } -static force_inline __m64 -in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest) -{ - src = _mm_or_si64 (src, MC (full_alpha)); - - return over (in (src, mask), mask, dest); -} - #ifndef _MSC_VER static force_inline __m64 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) @@ -298,10 +374,69 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) #endif -static force_inline __m64 -load8888 (uint32_t v) +/* Elemental unaligned loads */ + +static force_inline __m64 ldq_u(__m64 *p) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); +#ifdef USE_X86_MMX + /* x86's alignment restrictions are very relaxed. */ + return *(__m64 *)p; +#elif defined USE_ARM_IWMMXT + int align = (uintptr_t)p & 7; + __m64 *aligned_p; + if (align == 0) + return *p; + aligned_p = (__m64 *)((uintptr_t)p & ~7); + return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); +#else + struct __una_u64 { __m64 x __attribute__((packed)); }; + const struct __una_u64 *ptr = (const struct __una_u64 *) p; + return (__m64) ptr->x; +#endif +} + +static force_inline uint32_t ldl_u(const uint32_t *p) +{ +#ifdef USE_X86_MMX + /* x86's alignment restrictions are very relaxed. 
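 * On strict-alignment targets the #else branch below instead reads the
 * value through a __attribute__((packed)) struct, which forces the
 * compiler to emit alignment-safe load sequences. A memcpy-based sketch
 * of the same unaligned read (an equivalent formulation, not the code
 * used here):
 *
 *     #include <string.h>
 *
 *     static uint32_t ldl_u_sketch (const uint32_t *p)
 *     {
 *         uint32_t x;
 *         memcpy (&x, p, sizeof x);  // lowered to alignment-safe loads
 *         return x;
 *     }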
*/ + return *p; +#else + struct __una_u32 { uint32_t x __attribute__((packed)); }; + const struct __una_u32 *ptr = (const struct __una_u32 *) p; + return ptr->x; +#endif +} + +static force_inline __m64 +load (const uint32_t *v) +{ +#ifdef USE_LOONGSON_MMI + __m64 ret; + asm ("lwc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*v) + ); + return ret; +#else + return _mm_cvtsi32_si64 (*v); +#endif +} + +static force_inline __m64 +load8888 (const uint32_t *v) +{ +#ifdef USE_LOONGSON_MMI + return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); +#else + return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); +#endif +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ + uint32_t l = ldl_u (v); + return load8888 (&l); } static force_inline __m64 @@ -310,10 +445,53 @@ pack8888 (__m64 lo, __m64 hi) return _mm_packs_pu16 (lo, hi); } -static force_inline uint32_t -store8888 (__m64 v) +static force_inline void +store (uint32_t *dest, __m64 v) { - return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())); +#ifdef USE_LOONGSON_MMI + asm ("swc1 %1, %0\n\t" + : "=m" (*dest) + : "f" (v) + : "memory" + ); +#else + *dest = _mm_cvtsi64_si32 (v); +#endif +} + +static force_inline void +store8888 (uint32_t *dest, __m64 v) +{ + v = pack8888 (v, _mm_setzero_si64 ()); + store (dest, v); +} + +static force_inline pixman_bool_t +is_equal (__m64 a, __m64 b) +{ +#ifdef USE_LOONGSON_MMI + /* __m64 is double, we can compare directly. */ + return a == b; +#else + return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; +#endif +} + +static force_inline pixman_bool_t +is_opaque (__m64 v) +{ +#ifdef USE_LOONGSON_MMI + return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); +#else + __m64 ffs = _mm_cmpeq_pi8 (v, v); + return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); +#endif +} + +static force_inline pixman_bool_t +is_zero (__m64 v) +{ + return is_equal (v, _mm_setzero_si64 ()); } /* Expand 16 bits positioned at @pos (0-3) of a mmx register into @@ -337,7 +515,11 @@ expand565 (__m64 pixel, int pos) __m64 t1, t2; /* move pixel to low 16 bit and zero the rest */ +#ifdef USE_LOONGSON_MMI + p = loongson_extract_pi16 (p, pos); +#else p = shift (shift (p, (3 - pos) * 16), -48); +#endif t1 = shift (p, 36 - 11); t2 = shift (p, 16 - 5); @@ -350,6 +532,36 @@ expand565 (__m64 pixel, int pos) return _mm_srli_pi16 (pixel, 8); } +/* Expand 4 16 bit pixels in an mmx register into two mmx registers of + * + * AARRGGBBRRGGBB + */ +static force_inline void +expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) +{ + __m64 t0, t1, alpha = _mm_setzero_si64 (); + __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); + __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); + __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); + if (full_alpha) + alpha = _mm_cmpeq_pi32 (alpha, alpha); + + /* Replicate high bits into empty low bits. 
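 * That is, each channel is widened by copying its top bits into the
 * newly opened low bits: r8 = (r5 << 3) | (r5 >> 2) and
 * b8 = (b5 << 3) | (b5 >> 2) for the 5-bit channels, and
 * g8 = (g6 << 2) | (g6 >> 4) for the 6-bit green channel, so that full
 * intensity 0x1f (or 0x3f) expands to exactly 0xff rather than 0xf8
 * (or 0xfc). For example, r5 = 0b10110 becomes r8 = 0b10110101.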
*/ + r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); + g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); + b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); + + r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ + g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ + b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ + + t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ + t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ + + *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ + *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ +} + static force_inline __m64 expand8888 (__m64 in, int pos) { @@ -365,6 +577,17 @@ expandx888 (__m64 in, int pos) return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); } +static force_inline void +expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) +{ + __m64 v0, v1; + expand_4xpacked565 (vin, &v0, &v1, full_alpha); + *vout0 = expand8888 (v0, 0); + *vout1 = expand8888 (v0, 1); + *vout2 = expand8888 (v1, 0); + *vout3 = expand8888 (v1, 1); +} + static force_inline __m64 pack_565 (__m64 pixel, __m64 target, int pos) { @@ -376,6 +599,15 @@ pack_565 (__m64 pixel, __m64 target, int pos) g = _mm_and_si64 (p, MC (565_g)); b = _mm_and_si64 (p, MC (565_b)); +#ifdef USE_LOONGSON_MMI + r = shift (r, -(32 - 8)); + g = shift (g, -(16 - 3)); + b = shift (b, -(0 + 3)); + + p = _mm_or_si64 (r, g); + p = _mm_or_si64 (p, b); + return loongson_insert_pi16 (t, p, pos); +#else r = shift (r, -(32 - 8) + pos * 16); g = shift (g, -(16 - 3) + pos * 16); b = shift (b, -(0 + 3) + pos * 16); @@ -393,10 +625,42 @@ pack_565 (__m64 pixel, __m64 target, int pos) p = _mm_or_si64 (g, p); return _mm_or_si64 (b, p); +#endif +} + +static force_inline __m64 +pack_4xpacked565 (__m64 a, __m64 b) +{ + __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); + __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); + + __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); + __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); + + __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); + __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); + + t0 = _mm_or_si64 (t0, g0); + t1 = _mm_or_si64 (t1, g1); + + t0 = shift(t0, -5); +#ifdef USE_ARM_IWMMXT + t1 = shift(t1, -5); + return _mm_packs_pu32 (t0, t1); +#else + t1 = shift(t1, -5 + 16); + return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); +#endif } #ifndef _MSC_VER +static force_inline __m64 +pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) +{ + return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); +} + static force_inline __m64 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) { @@ -408,32 +672,52 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) #else +/* MSVC only handles a "pass by register" of up to three SSE intrinsics */ + +#define pack_4x565(v0, v1, v2, v3) \ + pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) + #define pix_add_mul(x, a, y, b) \ ( x = pix_multiply (x, a), \ - y = pix_multiply (y, a), \ + y = pix_multiply (y, b), \ pix_add (x, y) ) #endif /* --------------- MMX code patch for fbcompose.c --------------------- */ -static force_inline uint32_t +static force_inline __m64 combine (const uint32_t *src, const uint32_t *mask) { - uint32_t ssrc = *src; + __m64 vsrc = load8888 (src); if (mask) { - __m64 m = load8888 (*mask); - __m64 s = load8888 (ssrc); + __m64 m = load8888 (mask); m = expand_alpha (m); - s = 
pix_multiply (s, m); - - ssrc = store8888 (s); + vsrc = pix_multiply (vsrc, m); } - return ssrc; + return vsrc; +} + +static force_inline __m64 +core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) +{ + vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); + + if (is_opaque (vsrc)) + { + return vsrc; + } + else if (!is_zero (vsrc)) + { + return over (vsrc, expand_alpha (vsrc), + _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); + } + + return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); } static void @@ -448,19 +732,16 @@ mmx_combine_over_u (pixman_implementation_t *imp, while (dest < end) { - uint32_t ssrc = combine (src, mask); - uint32_t a = ssrc >> 24; + __m64 vsrc = combine (src, mask); - if (a == 0xff) + if (is_opaque (vsrc)) { - *dest = ssrc; + store8888 (dest, vsrc); } - else if (ssrc) + else if (!is_zero (vsrc)) { - __m64 s, sa; - s = load8888 (ssrc); - sa = expand_alpha (s); - *dest = store8888 (over (s, sa, load8888 (*dest))); + __m64 sa = expand_alpha (vsrc); + store8888 (dest, over (vsrc, sa, load8888 (dest))); } ++dest; @@ -484,11 +765,11 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 d, da; - uint32_t s = combine (src, mask); + __m64 s = combine (src, mask); - d = load8888 (*dest); + d = load8888 (dest); da = expand_alpha (d); - *dest = store8888 (over (d, da, load8888 (s))); + store8888 (dest, over (d, da, s)); ++dest; ++src; @@ -510,14 +791,14 @@ mmx_combine_in_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; + __m64 a; + __m64 x = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + a = load8888 (dest); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -539,13 +820,13 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; + __m64 a = combine (src, mask); + __m64 x; - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); a = expand_alpha (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -567,14 +848,14 @@ mmx_combine_out_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; + __m64 a; + __m64 x = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + a = load8888 (dest); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -596,15 +877,15 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; + __m64 a = combine (src, mask); + __m64 x; - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); - *dest = store8888 (x); + store8888 (dest, x); ++dest; ++src; @@ -626,15 +907,15 @@ mmx_combine_atop_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, da, d, sia; + __m64 da, d, sia; + __m64 s = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + d = load8888 (dest); sia = expand_alpha (s); sia = negate (sia); da = expand_alpha (d); s = pix_add_mul (s, da, d, sia); - *dest = store8888 (s); + store8888 (dest, s); ++dest; ++src; @@ -658,15 +939,15 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, dia, d, sa; + __m64 dia, d, sa; + __m64 s = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + d = load8888 (dest); sa = expand_alpha (s); dia = expand_alpha 
(d);
 dia = negate (dia);
 s = pix_add_mul (s, dia, d, sa);
- *dest = store8888 (s);
+ store8888 (dest, s);
 ++dest;
 ++src;
@@ -688,16 +969,16 @@ mmx_combine_xor_u (pixman_implementation_t *imp,
 while (dest < end)
 {
- __m64 s, dia, d, sia;
+ __m64 dia, d, sia;
+ __m64 s = combine (src, mask);
- s = load8888 (combine (src, mask));
- d = load8888 (*dest);
+ d = load8888 (dest);
 sia = expand_alpha (s);
 dia = expand_alpha (d);
 sia = negate (sia);
 dia = negate (dia);
 s = pix_add_mul (s, dia, d, sia);
- *dest = store8888 (s);
+ store8888 (dest, s);
 ++dest;
 ++src;
@@ -719,12 +1000,12 @@ mmx_combine_add_u (pixman_implementation_t *imp,
 while (dest < end)
 {
- __m64 s, d;
+ __m64 d;
+ __m64 s = combine (src, mask);
- s = load8888 (combine (src, mask));
- d = load8888 (*dest);
+ d = load8888 (dest);
 s = pix_add (s, d);
- *dest = store8888 (s);
+ store8888 (dest, s);
 ++dest;
 ++src;
@@ -746,22 +1027,25 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
 while (dest < end)
 {
- uint32_t s = combine (src, mask);
+ uint32_t s, sa, da;
 uint32_t d = *dest;
- __m64 ms = load8888 (s);
- __m64 md = load8888 (d);
- uint32_t sa = s >> 24;
- uint32_t da = ~d >> 24;
+ __m64 ms = combine (src, mask);
+ __m64 md = load8888 (dest);
+
+ store8888(&s, ms);
+ da = ~d >> 24;
+ sa = s >> 24;
 if (sa > da)
 {
- __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
+ uint32_t quot = DIV_UN8 (da, sa) << 24;
+ __m64 msa = load8888 (&quot);
 msa = expand_alpha (msa);
 ms = pix_multiply (ms, msa);
 }
 md = pix_add (md, ms);
- *dest = store8888 (md);
+ store8888 (dest, md);
 ++src;
 ++dest;
@@ -783,11 +1067,11 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
 while (src < end)
 {
- __m64 a = load8888 (*mask);
- __m64 s = load8888 (*src);
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
 s = pix_multiply (s, a);
- *dest = store8888 (s);
+ store8888 (dest, s);
 ++src;
 ++mask;
@@ -808,12 +1092,12 @@ mmx_combine_over_ca (pixman_implementation_t *imp,
 while (src < end)
 {
- __m64 a = load8888 (*mask);
- __m64 s = load8888 (*src);
- __m64 d = load8888 (*dest);
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
+ __m64 d = load8888 (dest);
 __m64 sa = expand_alpha (s);
- *dest = store8888 (in_over (s, sa, a, d));
+ store8888 (dest, in_over (s, sa, a, d));
 ++src;
 ++dest;
@@ -834,12 +1118,12 @@ mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 while (src < end)
 {
- __m64 a = load8888 (*mask);
- __m64 s = load8888 (*src);
- __m64 d = load8888 (*dest);
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
+ __m64 d = load8888 (dest);
 __m64 da = expand_alpha (d);
- *dest = store8888 (over (d, da, in (s, a)));
+ store8888 (dest, over (d, da, in (s, a)));
 ++src;
 ++dest;
@@ -860,14 +1144,14 @@ mmx_combine_in_ca (pixman_implementation_t *imp,
 while (src < end)
 {
- __m64 a = load8888 (*mask);
- __m64 s = load8888 (*src);
- __m64 d = load8888 (*dest);
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
+ __m64 d = load8888 (dest);
 __m64 da = expand_alpha (d);
 s = pix_multiply (s, a);
 s = pix_multiply (s, da);
- *dest = store8888 (s);
+ store8888 (dest, s);
 ++src;
 ++dest;
@@ -888,14 +1172,14 @@ mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 while (src < end)
 {
- __m64 a = load8888 (*mask);
- __m64 s = load8888 (*src);
- __m64 d = load8888 (*dest);
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
+ __m64 d = load8888 (dest);
 __m64 sa = expand_alpha (s);
 a = pix_multiply (a, sa);
 d = pix_multiply (d, a);
- *dest = store8888 (d);
+ store8888 (dest, d);
 ++src;
 ++dest;
@@ -916,15 +1200,15 @@ mmx_combine_out_ca
(pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); da = negate (da); s = pix_multiply (s, a); s = pix_multiply (s, da); - *dest = store8888 (s); + store8888 (dest, s); ++src; ++dest; @@ -945,15 +1229,15 @@ mmx_combine_out_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); a = pix_multiply (a, sa); a = negate (a); d = pix_multiply (d, a); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -974,9 +1258,9 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -984,7 +1268,7 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, a = pix_multiply (a, sa); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1005,9 +1289,9 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1015,7 +1299,7 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, a = pix_multiply (a, sa); da = negate (da); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1036,9 +1320,9 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); __m64 sa = expand_alpha (s); @@ -1047,7 +1331,7 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, da = negate (da); a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1068,13 +1352,13 @@ mmx_combine_add_ca (pixman_implementation_t *imp, while (src < end) { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); s = pix_multiply (s, a); d = pix_add (s, d); - *dest = store8888 (d); + store8888 (dest, d); ++src; ++dest; @@ -1087,19 +1371,9 @@ mmx_combine_add_ca (pixman_implementation_t *imp, static void mmx_composite_over_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint32_t *dst_line, *dst; int32_t w; @@ -1108,14 +1382,14 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, 
dest_image->bits.format); if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1126,9 +1400,9 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); w--; dst++; @@ -1152,12 +1426,9 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w) + if (w) { - *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); - - w--; - dst++; + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); } } @@ -1166,19 +1437,9 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, static void mmx_composite_over_n_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint16_t *dst_line, *dst; int32_t w; @@ -1187,14 +1448,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1205,7 +1466,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1219,16 +1480,17 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, while (w >= 4) { - __m64 vdest; + __m64 vdest = *(__m64 *)dst; + __m64 v0, v1, v2, v3; - vdest = *(__m64 *)dst; + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3); + v0 = over (vsrc, vsrca, v0); + v1 = over (vsrc, vsrca, v1); + v2 = over (vsrc, vsrca, v2); + v3 = over (vsrc, vsrca, v3); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); dst += 4; w -= 4; @@ -1254,20 +1516,10 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, static void mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - uint32_t src, srca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; uint32_t *dst_line; uint32_t *mask_line; int dst_stride, mask_stride; @@ -1275,16 +1527,15 @@ mmx_composite_over_n_8888_8888_ca 
(pixman_implementation_t *imp, CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1293,15 +1544,15 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, uint32_t *p = (uint32_t *)mask_line; uint32_t *q = (uint32_t *)dst_line; - while (twidth && (unsigned long)q & 7) + while (twidth && (uintptr_t)q & 7) { uint32_t m = *(uint32_t *)p; if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1320,9 +1571,9 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, __m64 dest0, dest1; __m64 vdest = *(__m64 *)q; - dest0 = in_over (vsrc, vsrca, load8888 (m0), + dest0 = in_over (vsrc, vsrca, load8888 (&m0), expand8888 (vdest, 0)); - dest1 = in_over (vsrc, vsrca, load8888 (m1), + dest1 = in_over (vsrc, vsrca, load8888 (&m1), expand8888 (vdest, 1)); *(__m64 *)q = pack8888 (dest0, dest1); @@ -1333,15 +1584,15 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, twidth -= 2; } - while (twidth) + if (twidth) { uint32_t m = *(uint32_t *)p; if (m) { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); - *q = store8888 (vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1358,37 +1609,23 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, static void mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; __m64 vmask; int dst_stride, src_stride; int32_t w; - __m64 srca; CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); - mask &= 0xff000000; - mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); - srca = MC (4x00ff); + mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); + vmask = expand_alpha (load8888 (&mask)); while (height--) { @@ -1398,12 +1635,12 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, 
expand_alpha (s), vmask, d)); w--; dst++; @@ -1412,7 +1649,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, while (w >= 2) { - __m64 vs = *(__m64 *)src; + __m64 vs = ldq_u ((__m64 *)src); __m64 vd = *(__m64 *)dst; __m64 vsrc0 = expand8888 (vs, 0); __m64 vsrc1 = expand8888 (vs, 1); @@ -1426,16 +1663,12 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, src += 2; } - while (w) + if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); - - w--; - dst++; - src++; + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); } } @@ -1444,19 +1677,9 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, static void mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; @@ -1467,13 +1690,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); - mask &= 0xff000000; - mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = expand_alpha (load8888 (&mask)); srca = MC (4x00ff); while (height--) @@ -1484,12 +1705,13 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1507,14 +1729,14 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, __m64 vd6 = *(__m64 *)(dst + 12); __m64 vd7 = *(__m64 *)(dst + 14); - __m64 vs0 = *(__m64 *)(src + 0); - __m64 vs1 = *(__m64 *)(src + 2); - __m64 vs2 = *(__m64 *)(src + 4); - __m64 vs3 = *(__m64 *)(src + 6); - __m64 vs4 = *(__m64 *)(src + 8); - __m64 vs5 = *(__m64 *)(src + 10); - __m64 vs6 = *(__m64 *)(src + 12); - __m64 vs7 = *(__m64 *)(src + 14); + __m64 vs0 = ldq_u ((__m64 *)(src + 0)); + __m64 vs1 = ldq_u ((__m64 *)(src + 2)); + __m64 vs2 = ldq_u ((__m64 *)(src + 4)); + __m64 vs3 = ldq_u ((__m64 *)(src + 6)); + __m64 vs4 = ldq_u ((__m64 *)(src + 8)); + __m64 vs5 = ldq_u ((__m64 *)(src + 10)); + __m64 vs6 = ldq_u ((__m64 *)(src + 12)); + __m64 vs7 = ldq_u ((__m64 *)(src + 14)); vd0 = pack8888 ( in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), @@ -1564,10 +1786,11 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, while (w) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, 
vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1580,19 +1803,9 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, static void mmx_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t s; @@ -1602,7 +1815,7 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) @@ -1625,9 +1838,9 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, else if (s) { __m64 ms, sa; - ms = load8888 (s); + ms = load8888 (&s); sa = expand_alpha (ms); - *dst = store8888 (over (ms, sa, load8888 (*dst))); + store8888 (dst, over (ms, sa, load8888 (dst))); } dst++; @@ -1638,19 +1851,9 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, static void mmx_composite_over_8888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint16_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -1658,7 +1861,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 @@ -1676,9 +1879,9 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1696,22 +1899,23 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w >= 4) { + __m64 vdest = *(__m64 *)dst; + __m64 v0, v1, v2, v3; __m64 vsrc0, vsrc1, vsrc2, vsrc3; - __m64 vdest; - vsrc0 = load8888 (*(src + 0)); - vsrc1 = load8888 (*(src + 1)); - vsrc2 = load8888 (*(src + 2)); - vsrc3 = load8888 (*(src + 3)); + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); - vdest = *(__m64 *)dst; + vsrc0 = load8888 ((src + 0)); + vsrc1 = load8888 ((src + 1)); + vsrc2 = load8888 ((src + 2)); + vsrc3 = load8888 ((src + 3)); - vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3); + v0 = over (vsrc0, expand_alpha (vsrc0), v0); + v1 = over (vsrc1, expand_alpha (vsrc1), v1); + v2 = over (vsrc2, expand_alpha (vsrc2), v2); + v3 = over (vsrc3, expand_alpha 
(vsrc3), v3); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); w -= 4; dst += 4; @@ -1722,7 +1926,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -1741,19 +1945,9 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, static void mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; @@ -1764,7 +1958,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) @@ -1772,10 +1966,10 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, srcsrc = (uint64_t)src << 32 | src; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -1788,7 +1982,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; @@ -1796,9 +1990,9 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, { __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), - load8888 (*dst)); + load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } w--; @@ -1841,44 +2035,41 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w) + if (w) { uint64_t m = *mask; if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in_over ( vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); - *dst = store8888 (vdest); + store8888 (dst, vdest); } - - w--; - mask++; - dst++; } } _mm_empty (); } -pixman_bool_t -pixman_fill_mmx (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) +static pixman_bool_t +mmx_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t filler) { uint64_t fill; __m64 vfill; uint32_t byte_width; uint8_t *byte_line; -#ifdef __GNUC__ +#if defined __GNUC__ && defined USE_X86_MMX __m64 v1, v2, v3, v4, v5, v6, v7; #endif @@ -1891,7 +2082,7 @@ pixman_fill_mmx (uint32_t *bits, byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); byte_width = width; stride *= 1; - xor = (xor & 0xff) * 0x01010101; + filler = (filler & 0xff) * 0x01010101; } else if (bpp == 16) { @@ -1899,7 +2090,7 @@ pixman_fill_mmx (uint32_t *bits, byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; - xor = (xor & 0xffff) * 0x00010001; + filler = (filler & 0xffff) * 0x00010001; } else { @@ -1909,10 +2100,10 @@ pixman_fill_mmx (uint32_t *bits, stride *= 4; } - fill = ((uint64_t)xor << 32) | xor; + fill = 
((uint64_t)filler << 32) | filler; vfill = to_m64 (fill); -#ifdef __GNUC__ +#if defined __GNUC__ && defined USE_X86_MMX __asm__ ( "movq %7, %0\n" "movq %7, %1\n" @@ -1934,23 +2125,23 @@ pixman_fill_mmx (uint32_t *bits, byte_line += stride; w = byte_width; - while (w >= 1 && ((unsigned long)d & 1)) + if (w >= 1 && ((uintptr_t)d & 1)) { - *(uint8_t *)d = (xor & 0xff); + *(uint8_t *)d = (filler & 0xff); w--; d++; } - while (w >= 2 && ((unsigned long)d & 3)) + if (w >= 2 && ((uintptr_t)d & 3)) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } - while (w >= 4 && ((unsigned long)d & 7)) + while (w >= 4 && ((uintptr_t)d & 7)) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; @@ -1958,7 +2149,7 @@ pixman_fill_mmx (uint32_t *bits, while (w >= 64) { -#ifdef __GNUC__ +#if defined __GNUC__ && defined USE_X86_MMX __asm__ ( "movq %1, (%0)\n" "movq %2, 8(%0)\n" @@ -1989,20 +2180,20 @@ pixman_fill_mmx (uint32_t *bits, while (w >= 4) { - *(uint32_t *)d = xor; + *(uint32_t *)d = filler; w -= 4; d += 4; } - while (w >= 2) + if (w >= 2) { - *(uint16_t *)d = xor; + *(uint16_t *)d = filler; w -= 2; d += 2; } - while (w >= 1) + if (w >= 1) { - *(uint8_t *)d = (xor & 0xff); + *(uint8_t *)d = (filler & 0xff); w--; d++; } @@ -2014,48 +2205,93 @@ pixman_fill_mmx (uint32_t *bits, } static void -mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +mmx_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 7) + { + s = *src++; + *dst = convert_8888_to_0565 (s); + dst++; + w--; + } + + while (w >= 4) + { + __m64 vdest; + __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); + __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); + + vdest = pack_4xpacked565 (vsrc0, vsrc1); + + *(__m64 *)dst = vdest; + + w -= 4; + src += 4; + dst += 4; + } + + while (w) + { + s = *src++; + *dst = convert_8888_to_0565 (s); + dst++; + w--; + } + } + + _mm_empty (); +} + +static void +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; int32_t w; - __m64 vsrc, vsrca; + __m64 vsrc; uint64_t srcsrc; CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) { - pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (dst_image->bits.format), - dest_x, dest_y, width, height, 0); + mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), + dest_x, dest_y, width, height, 0); return; } srcsrc = (uint64_t)src << 32 | src; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, 
dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); - vsrca = expand_alpha (vsrc); + vsrc = load8888 (&src); while (height--) { @@ -2067,7 +2303,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; @@ -2075,7 +2311,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, { __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { @@ -2101,11 +2337,8 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, } else if (m0 | m1) { - __m64 vdest; __m64 dest0, dest1; - vdest = *(__m64 *)dst; - dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); @@ -2123,25 +2356,21 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, CHECKPOINT (); - while (w) + if (w) { uint64_t m = *mask; if (m) { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); - *dst = store8888 (vdest); + store8888 (dst, vdest); } else { *dst = 0; } - - w--; - mask++; - dst++; } } @@ -2150,47 +2379,33 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, static void mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint16_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; int32_t w; __m64 vsrc, vsrca, tmp; - uint64_t srcsrcsrcsrc, src16; + __m64 srcsrcsrcsrc; CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); - src16 = to_uint64 (tmp); - - srcsrcsrcsrc = - (uint64_t)src16 << 48 | (uint64_t)src16 << 32 | - (uint64_t)src16 << 16 | (uint64_t)src16; + srcsrcsrcsrc = expand_alpha_rev (tmp); while (height--) { @@ -2202,7 +2417,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { uint64_t m = *mask; @@ -2234,29 +2449,29 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) { - *(uint64_t *)dst = srcsrcsrcsrc; + *(__m64 *)dst = srcsrcsrcsrc; } else if (m0 | m1 | m2 | m3) { - __m64 vdest; + __m64 vdest = *(__m64 *)dst; + __m64 v0, v1, v2, v3; __m64 vm0, vm1, vm2, vm3; - vdest = *(__m64 *)dst; + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); vm0 = to_m64 (m0); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), - expand565 (vdest, 0)), vdest, 0); - vm1 = to_m64 (m1); - vdest = pack_565 (in_over (vsrc, vsrca, 
expand_alpha_rev (vm1), - expand565 (vdest, 1)), vdest, 1); - vm2 = to_m64 (m2); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), - expand565 (vdest, 2)), vdest, 2); - vm3 = to_m64 (m3); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), - expand565 (vdest, 3)), vdest, 3); + v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); - *(__m64 *)dst = vdest; + vm1 = to_m64 (m1); + v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); + + vm2 = to_m64 (m2); + v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); + + vm3 = to_m64 (m3); + v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); + + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -2291,19 +2506,9 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, static void mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint16_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -2311,7 +2516,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 @@ -2329,9 +2534,9 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, CHECKPOINT (); - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2363,24 +2568,31 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { - __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0); - vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3); + __m64 v0 = invert_colors (load8888 (&s0)); + __m64 v1 = invert_colors (load8888 (&s1)); + __m64 v2 = invert_colors (load8888 (&s2)); + __m64 v3 = invert_colors (load8888 (&s3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } else if (s0 | s1 | s2 | s3) { __m64 vdest = *(__m64 *)dst; + __m64 v0, v1, v2, v3; - vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3); + __m64 vsrc0 = load8888 (&s0); + __m64 vsrc1 = load8888 (&s1); + __m64 vsrc2 = load8888 (&s2); + __m64 vsrc3 = load8888 (&s3); - *(__m64 *)dst = vdest; + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + + v0 = over_rev_non_pre (vsrc0, v0); + v1 = over_rev_non_pre (vsrc1, v1); + v2 = over_rev_non_pre (vsrc2, v2); + v3 = over_rev_non_pre (vsrc3, v3); + + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -2392,7 +2604,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888
(src); uint64_t d = *dst; __m64 vdest = expand565 (to_m64 (d), 0); @@ -2411,19 +2623,9 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, static void mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -2431,7 +2633,7 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 @@ -2447,12 +2649,12 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); w--; dst++; @@ -2461,7 +2663,7 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, while (w >= 2) { - uint64_t s0, s1; + uint32_t s0, s1; unsigned char a0, a1; __m64 d0, d1; @@ -2473,8 +2675,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, if ((a0 & a1) == 0xFF) { - d0 = invert_colors (load8888 (s0)); - d1 = invert_colors (load8888 (s1)); + d0 = invert_colors (load8888 (&s0)); + d1 = invert_colors (load8888 (&s1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2482,8 +2684,8 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2493,16 +2695,12 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, src += 2; } - while (w) + if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); - - w--; - dst++; - src++; + store8888 (dst, over_rev_non_pre (s, d)); } } @@ -2511,20 +2709,10 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, static void mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - uint32_t src, srca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; uint16_t *dst_line; uint32_t *mask_line; int dst_stride, mask_stride; @@ -2532,16 +2720,15 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, CHECKPOINT (); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; if (src == 0) return; - PIXMAN_IMAGE_GET_LINE (dst_image, 
dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2550,7 +2737,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, uint32_t *p = (uint32_t *)mask_line; uint16_t *q = (uint16_t *)dst_line; - while (twidth && ((unsigned long)q & 7)) + while (twidth && ((uintptr_t)q & 7)) { uint32_t m = *(uint32_t *)p; @@ -2558,7 +2745,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2579,13 +2766,16 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, if ((m0 | m1 | m2 | m3)) { __m64 vdest = *(__m64 *)q; + __m64 v0, v1, v2, v3; - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); - *(__m64 *)q = vdest; + v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); + v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); + v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); + v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); + + *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); } twidth -= 4; p += 4; @@ -2601,7 +2791,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { uint64_t d = *q; __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); } @@ -2619,19 +2809,9 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, static void mmx_composite_in_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; @@ -2640,14 +2820,14 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, uint8_t sa; __m64 vsrc, vsrca; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); sa = src >> 24; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2658,26 +2838,35 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - if ((((unsigned long)dst_image & 3) == 0) && - (((unsigned long)src_image & 3) == 0)) + while (w && (uintptr_t)dst & 7) { - while (w >= 
4) - { - uint32_t m; - __m64 vmask; - __m64 vdest; + uint16_t tmp; + uint8_t a; + uint32_t m, d; - m = 0; + a = *mask++; + d = *dst; - vmask = load8888 (*(uint32_t *)mask); - vdest = load8888 (*(uint32_t *)dst); + m = MUL_UN8 (sa, a, tmp); + d = MUL_UN8 (m, d, tmp); - *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest)); + *dst++ = d; + w--; + } - dst += 4; - mask += 4; - w -= 4; - } + while (w >= 4) + { + __m64 vmask; + __m64 vdest; + + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); + + store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); + + dst += 4; + mask += 4; + w -= 4; } while (w--) @@ -2701,25 +2890,15 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, static void mmx_composite_in_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *src_line, *src; int src_stride, dst_stride; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) @@ -2730,20 +2909,31 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp, src_line += src_stride; w = width; - if ((((unsigned long)dst_image & 3) == 0) && - (((unsigned long)src_image & 3) == 0)) + while (w && (uintptr_t)dst & 3) { - while (w >= 4) - { - uint32_t *s = (uint32_t *)src; - uint32_t *d = (uint32_t *)dst; + uint8_t s, d; + uint16_t tmp; - *d = store8888 (in (load8888 (*s), load8888 (*d))); + s = *src; + d = *dst; - w -= 4; - dst += 4; - src += 4; - } + *dst = MUL_UN8 (s, d, tmp); + + src++; + dst++; + w--; + } + + while (w >= 4) + { + uint32_t *s = (uint32_t *)src; + uint32_t *d = (uint32_t *)dst; + + store8888 (d, in (load8888u (s), load8888 (d))); + + w -= 4; + dst += 4; + src += 4; } while (w--) @@ -2766,19 +2956,9 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp, static void mmx_composite_add_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; @@ -2787,17 +2967,17 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, uint8_t sa; __m64 vsrc, vsrca; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); sa = src >> 24; if (src == 0) return; - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) @@ -2808,20 +2988,36 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - if ((((unsigned long)mask_image & 3) == 0) && - 
(((unsigned long)dst_image & 3) == 0)) + while (w && (uintptr_t)dst & 3) { - while (w >= 4) - { - __m64 vmask = load8888 (*(uint32_t *)mask); - __m64 vdest = load8888 (*(uint32_t *)dst); + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; - *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); + a = *mask++; + d = *dst; - w -= 4; - dst += 4; - mask += 4; - } + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + w--; + } + + while (w >= 4) + { + __m64 vmask; + __m64 vdest; + + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); + + store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); + + dst += 4; + mask += 4; + w -= 4; } while (w--) @@ -2846,19 +3042,9 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, static void mmx_composite_add_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *src_line, *src; int dst_stride, src_stride; @@ -2869,7 +3055,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp, CHECKPOINT (); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { @@ -2879,7 +3065,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp, src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { s = *src; d = *dst; @@ -2894,7 +3080,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp, while (w >= 8) { - *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); + *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); dst += 8; src += 8; w -= 8; @@ -2918,30 +3104,21 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp, } static void -mmx_composite_add_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +mmx_composite_add_0565_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - __m64 dst64; - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t d; + uint16_t *src_line, *src; + uint32_t s; int dst_stride, src_stride; int32_t w; CHECKPOINT (); - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); while (height--) { @@ -2951,10 +3128,92 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - while (w && (unsigned long)dst & 7) + while (w && (uintptr_t)dst & 7) { - *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), - _mm_cvtsi32_si64 (*dst))); + s = *src++; + if (s) + { + d = *dst; + s = convert_0565_to_8888 (s); + if 
(d) + { + d = convert_0565_to_8888 (d); + UN8x4_ADD_UN8x4 (s, d); + } + *dst = convert_8888_to_0565 (s); + } + dst++; + w--; + } + + while (w >= 4) + { + __m64 vdest = *(__m64 *)dst; + __m64 vsrc = ldq_u ((__m64 *)src); + __m64 vd0, vd1; + __m64 vs0, vs1; + + expand_4xpacked565 (vdest, &vd0, &vd1, 0); + expand_4xpacked565 (vsrc, &vs0, &vs1, 0); + + vd0 = _mm_adds_pu8 (vd0, vs0); + vd1 = _mm_adds_pu8 (vd1, vs1); + + *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); + + dst += 4; + src += 4; + w -= 4; + } + + while (w--) + { + s = *src++; + if (s) + { + d = *dst; + s = convert_0565_to_8888 (s); + if (d) + { + d = convert_0565_to_8888 (d); + UN8x4_ADD_UN8x4 (s, d); + } + *dst = convert_8888_to_0565 (s); + } + dst++; + } + } + + _mm_empty (); +} + +static void +mmx_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 7) + { + store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), + load ((const uint32_t *)dst))); dst++; src++; w--; @@ -2962,8 +3221,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp, while (w >= 2) { - dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); - *(uint64_t*)dst = to_uint64 (dst64); + *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); dst += 2; src += 2; w -= 2; @@ -2971,8 +3229,8 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp, if (w) { - *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), - _mm_cvtsi32_si64 (*dst))); + store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), + load ((const uint32_t *)dst))); } } @@ -2981,18 +3239,19 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp, } static pixman_bool_t -pixman_blt_mmx (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) +mmx_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) { uint8_t * src_bytes; uint8_t * dst_bytes; @@ -3006,7 +3265,7 @@ pixman_blt_mmx (uint32_t *src_bits, src_stride = src_stride * (int) sizeof (uint32_t) / 2; dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 2 * width; src_stride *= 2; dst_stride *= 2; @@ -3016,7 +3275,7 @@ pixman_blt_mmx (uint32_t *src_bits, src_stride = src_stride * (int) sizeof (uint32_t) / 4; dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 4 * width; src_stride *= 
4; dst_stride *= 4; @@ -3035,7 +3294,15 @@ pixman_blt_mmx (uint32_t *src_bits, dst_bytes += dst_stride; w = byte_width; - while (w >= 2 && ((unsigned long)d & 3)) + if (w >= 1 && ((uintptr_t)d & 1)) + { + *(uint8_t *)d = *(uint8_t *)s; + w -= 1; + s += 1; + d += 1; + } + + if (w >= 2 && ((uintptr_t)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; w -= 2; @@ -3043,9 +3310,9 @@ pixman_blt_mmx (uint32_t *src_bits, d += 2; } - while (w >= 4 && ((unsigned long)d & 7)) + while (w >= 4 && ((uintptr_t)d & 7)) { - *(uint32_t *)d = *(uint32_t *)s; + *(uint32_t *)d = ldl_u ((uint32_t *)s); w -= 4; s += 4; @@ -3054,7 +3321,7 @@ pixman_blt_mmx (uint32_t *src_bits, while (w >= 64) { -#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) +#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX __asm__ ( "movq (%1), %%mm0\n" "movq 8(%1), %%mm1\n" @@ -3079,14 +3346,14 @@ pixman_blt_mmx (uint32_t *src_bits, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); #else - __m64 v0 = *(__m64 *)(s + 0); - __m64 v1 = *(__m64 *)(s + 8); - __m64 v2 = *(__m64 *)(s + 16); - __m64 v3 = *(__m64 *)(s + 24); - __m64 v4 = *(__m64 *)(s + 32); - __m64 v5 = *(__m64 *)(s + 40); - __m64 v6 = *(__m64 *)(s + 48); - __m64 v7 = *(__m64 *)(s + 56); + __m64 v0 = ldq_u ((__m64 *)(s + 0)); + __m64 v1 = ldq_u ((__m64 *)(s + 8)); + __m64 v2 = ldq_u ((__m64 *)(s + 16)); + __m64 v3 = ldq_u ((__m64 *)(s + 24)); + __m64 v4 = ldq_u ((__m64 *)(s + 32)); + __m64 v5 = ldq_u ((__m64 *)(s + 40)); + __m64 v6 = ldq_u ((__m64 *)(s + 48)); + __m64 v7 = ldq_u ((__m64 *)(s + 56)); *(__m64 *)(d + 0) = v0; *(__m64 *)(d + 8) = v1; *(__m64 *)(d + 16) = v2; @@ -3103,7 +3370,7 @@ pixman_blt_mmx (uint32_t *src_bits, } while (w >= 4) { - *(uint32_t *)d = *(uint32_t *)s; + *(uint32_t *)d = ldl_u ((uint32_t *)s); w -= 4; s += 4; @@ -3125,51 +3392,31 @@ pixman_blt_mmx (uint32_t *src_bits, static void mmx_composite_copy_area (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - pixman_blt_mmx (src_image->bits.bits, - dst_image->bits.bits, - src_image->bits.rowstride, - dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (src_image->bits.format), - PIXMAN_FORMAT_BPP (dst_image->bits.format), - src_x, src_y, dest_x, dest_y, width, height); + PIXMAN_COMPOSITE_ARGS (info); + + mmx_blt (imp, src_image->bits.bits, + dest_image->bits.bits, + src_image->bits.rowstride, + dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dest_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); } -#if 0 static void mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *src, *src_line; uint32_t *dst, *dst_line; uint8_t *mask, *mask_line; int src_stride, mask_stride, dst_stride; int32_t w; - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE 
(mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); @@ -3190,19 +3437,20 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, if (m) { - __m64 s = load8888 (*src | 0xff000000); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); if (m == 0xff) { - *dst = store8888 (s); + store8888 (dst, s); } else { __m64 sa = expand_alpha (s); __m64 vm = expand_alpha_rev (to_m64 (m)); - __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } @@ -3214,7 +3462,489 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, _mm_empty (); } -#endif + +static void +mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc; + + CHECKPOINT (); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + vsrc = load8888 (&src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + CHECKPOINT (); + + while (w && (uintptr_t)dst & 7) + { + __m64 vdest = load8888 (dst); + + store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); + + w--; + dst++; + } + + while (w >= 2) + { + __m64 vdest = *(__m64 *)dst; + __m64 dest0 = expand8888 (vdest, 0); + __m64 dest1 = expand8888 (vdest, 1); + + + dest0 = over (dest0, expand_alpha (dest0), vsrc); + dest1 = over (dest1, expand_alpha (dest1), vsrc); + + *(__m64 *)dst = pack8888 (dest0, dest1); + + dst += 2; + w -= 2; + } + + CHECKPOINT (); + + if (w) + { + __m64 vdest = load8888 (dst); + + store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); + } + } + + _mm_empty (); +} + +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) +#define BMSK (BSHIFT - 1) + +#define BILINEAR_DECLARE_VARIABLES \ + const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ + const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ + const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ + const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ + const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ + const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ + const __m64 mm_zero = _mm_setzero_si64 (); \ + __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +do { \ + /* fetch 2x2 pixel block into 2 mmx registers */ \ + __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ + __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ + /* vertical interpolation */ \ + __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ + __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ + __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ + __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ + __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ + __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ + vx += unit_x; \ + if (BILINEAR_INTERPOLATION_BITS < 8) \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ + _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS))); \ + /* horizontal interpolation */ \ + __m64 p = _mm_unpacklo_pi16 (lo, hi); \ + __m64 q = 
_mm_unpackhi_pi16 (lo, hi); \ + lo = _mm_madd_pi16 (p, mm_wh); \ + hi = _mm_madd_pi16 (q, mm_wh); \ + } \ + else \ + { \ + /* calculate horizontal weights */ \ + __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS); \ + /* horizontal interpolation */ \ + __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ + __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ + __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ + __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ + lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ + hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ + _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ + } \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ + /* shift and pack the result */ \ + hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ + lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ + lo = _mm_packs_pi32 (lo, hi); \ + lo = _mm_packs_pu16 (lo, lo); \ + pix = lo; \ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL() \ +do { \ + vx += unit_x; \ + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ +} while(0)
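For reference, the arithmetic that BILINEAR_INTERPOLATE_ONE_PIXEL performs on each channel is ordinary two-pass fixed-point bilinear filtering. The scalar sketch below is illustrative only and not part of the patch: tl/tr/bl/br stand for the four neighboring channel values, each weight pair sums to 1 << BILINEAR_INTERPOLATION_BITS, and the final shift drops the doubled precision, exactly as the macro's comments describe.

#include <stdint.h>

#define BILINEAR_INTERPOLATION_BITS 7	/* same value pixman-private.h defines below */

static inline uint8_t
bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                  int32_t wt, int32_t wb, int32_t wl, int32_t wr)
{
    /* vertical pass: blend each column's top and bottom neighbors */
    int32_t left  = tl * wt + bl * wb;
    int32_t right = tr * wt + br * wb;

    /* horizontal pass, then shift out 2 * BITS of weight precision */
    return (uint8_t) ((left * wl + right * wr) >>
                      (2 * BILINEAR_INTERPOLATION_BITS));
}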
+ +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix; + + while (w--) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix); + store (dst, pix); + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, + scaled_bilinear_scanline_mmx_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix1, pix2; + + while (w) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (!is_zero (pix1)) + { + pix2 = load (dst); + store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); + } + + w--; + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, + scaled_bilinear_scanline_mmx_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, + const uint8_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + __m64 pix1, pix2; + uint32_t m; + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (m == 0xff && is_opaque (pix1)) + { + store (dst, pix1); + } + else + { + __m64 ms, md, ma, msa; + + pix2 = load (dst); + ma = expand_alpha_rev (to_m64 (m)); + ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); + md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); + + msa = expand_alpha (ms); + + store8888 (dst, (in_over (ms, msa, ma, md))); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } + + _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NORMAL, FLAG_HAVE_NON_SOLID_MASK) + +static uint32_t * +mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint32_t *src = (uint32_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((uintptr_t)dst) & 7) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + while (w >= 8) + { + __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); + __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); + __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); + __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); + + *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); + *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); + *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); + *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); + + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + _mm_empty (); + return iter->buffer; +} + +static uint32_t * +mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint16_t *src = (uint16_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((uintptr_t)dst) & 0x0f) + { + uint16_t s = *src++; + + *dst++ = convert_0565_to_8888 (s); + w--; + } + + while (w >= 4) + { + __m64 vsrc = ldq_u ((__m64 *)src); + __m64 mm0, mm1; + + expand_4xpacked565 (vsrc, &mm0, &mm1, 1); + + *(__m64 *)(dst + 0) = mm0; + *(__m64 *)(dst + 2) = mm1; + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint16_t s = *src++; + + *dst++ = convert_0565_to_8888 (s); + w--; + } + + _mm_empty (); + return iter->buffer; +} + +static uint32_t * +mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = iter->bits; + + iter->bits += iter->stride; + + while (w && (((uintptr_t)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 8) + { + __m64 mm0 = ldq_u ((__m64 *)src); + + __m64 mm1 = _mm_unpacklo_pi8
(_mm_setzero_si64(), mm0); + __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); + __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); + __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); + __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); + __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); + + *(__m64 *)(dst + 0) = mm3; + *(__m64 *)(dst + 2) = mm4; + *(__m64 *)(dst + 4) = mm5; + *(__m64 *)(dst + 6) = mm6; + + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + _mm_empty (); + return iter->buffer; +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 }, + { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 }, + { PIXMAN_a8, mmx_fetch_a8 }, + { PIXMAN_null } +}; + +static pixman_bool_t +mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FLAGS) == FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return TRUE; + } + } + } + + return FALSE; +} static const pixman_fast_path_t mmx_fast_paths[] = { @@ -3244,18 +3974,14 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), -#if 0 - /* FIXME: This code is commented out since it's apparently - * not actually faster than the generic code. 
- */ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), - PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), - PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), -#endif + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), + PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), @@ -3266,11 +3992,20 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), + + PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), + PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), @@ -3287,63 +4022,30 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, 
x8b8g8r8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), + { PIXMAN_OP_NONE }, }; -static pixman_bool_t -mmx_blt (pixman_implementation_t *imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - if (!pixman_blt_mmx ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) - - { - return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); - } - - return TRUE; -} - -static pixman_bool_t -mmx_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor)) - { - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); - } - - return TRUE; -} - pixman_implementation_t * -_pixman_implementation_create_mmx (void) +_pixman_implementation_create_mmx (pixman_implementation_t *fallback) { - pixman_implementation_t *general = _pixman_implementation_create_fast_path (); - pixman_implementation_t *imp = _pixman_implementation_create (general, mmx_fast_paths); + pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; @@ -3372,7 +4074,9 @@ _pixman_implementation_create_mmx (void) imp->blt = mmx_blt; imp->fill = mmx_fill; + imp->src_iter_init = mmx_src_iter_init; + return imp; } -#endif /* USE_MMX */ +#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */ diff --git a/programs/develop/libraries/pixman/pixman-noop.c b/programs/develop/libraries/pixman/pixman-noop.c new file mode 100644 index 0000000000..e39996d9df --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-noop.c @@ -0,0 +1,176 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2011 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <string.h> +#include <stdlib.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +static void +noop_composite (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + return; +} + +static void +dest_write_back_direct (pixman_iter_t *iter) +{ + iter->buffer += iter->image->bits.rowstride; +} + +static uint32_t * +noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask) +{ + uint32_t *result = iter->buffer; + + iter->buffer += iter->image->bits.rowstride; + + return result; +} + +static uint32_t * +get_scanline_null (pixman_iter_t *iter, const uint32_t *mask) +{ + return NULL; +} + +static pixman_bool_t +noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) + + if (!image) + { + iter->get_scanline = get_scanline_null; + } + else if ((iter->iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == + (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) + { + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else if (image->common.extended_format_code == PIXMAN_solid && + (iter->image->type == SOLID || + (iter->image_flags & FAST_PATH_NO_ALPHA_MAP))) + { + if (iter->iter_flags & ITER_NARROW) + { + uint32_t *buffer = iter->buffer; + uint32_t *end = buffer + iter->width; + uint32_t color; + + if (image->type == SOLID) + color = image->solid.color_32; + else + color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); + + while (buffer < end) + *(buffer++) = color; + } + else + { + argb_t *buffer = (argb_t *)iter->buffer; + argb_t *end = buffer + iter->width; + argb_t color; + + if (image->type == SOLID) + color = image->solid.color_float; + else + color = image->bits.fetch_pixel_float (&image->bits, 0, 0); + + while (buffer < end) + *(buffer++) = color; + } + + iter->get_scanline = _pixman_iter_get_scanline_noop; + } + else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 && + (iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FLAGS) == FLAGS && + iter->x >= 0 && iter->y >= 0 && + iter->x + iter->width <= image->bits.width && + iter->y + iter->height <= image->bits.height) + { + iter->buffer = + image->bits.bits + iter->y * image->bits.rowstride + iter->x; + + iter->get_scanline = noop_get_scanline; + } + else + { + return FALSE; + } + + return TRUE; +} + +static pixman_bool_t +noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + uint32_t image_flags = iter->image_flags; + uint32_t iter_flags = iter->iter_flags; + + if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS && + (iter_flags & ITER_NARROW) == ITER_NARROW && + ((image->common.extended_format_code == PIXMAN_a8r8g8b8) || + (image->common.extended_format_code == PIXMAN_x8r8g8b8 && + (iter_flags & (ITER_LOCALIZED_ALPHA))))) + { + iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x; + + iter->get_scanline = _pixman_iter_get_scanline_noop; + iter->write_back = dest_write_back_direct; + + return TRUE; + } + else + { + return FALSE; + } +} + +static const pixman_fast_path_t noop_fast_paths[] = +{ + { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite }, + { PIXMAN_OP_NONE }, +};
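For orientation, the iterators set up by noop_src_iter_init / noop_dest_iter_init above are consumed by pixman's general compositing loop roughly as sketched below. This is a simplified, hypothetical driver, not code from this patch (the real loop in pixman-general.c also handles a mask iterator, combiner lookup and the wide float pipeline); it shows why handing out direct pointers into the image bits, as noop_get_scanline and dest_write_back_direct do, saves a per-scanline copy.

/* Hypothetical per-row driver for a mask-less operation. */
static void
composite_rows (pixman_implementation_t *imp,
                pixman_op_t              op,
                pixman_combine_32_func_t combine,
                pixman_iter_t *          src_iter,
                pixman_iter_t *          dest_iter,
                int                      width,
                int                      height)
{
    int i;

    for (i = 0; i < height; ++i)
    {
        /* with the noop iterators these are pointers straight into
         * the source/destination bits, not temporary buffers */
        uint32_t *s = src_iter->get_scanline (src_iter, NULL);
        uint32_t *d = dest_iter->get_scanline (dest_iter, NULL);

        combine (imp, op, d, s, NULL, width);

        dest_iter->write_back (dest_iter);
    }
}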
+ +pixman_implementation_t * +_pixman_implementation_create_noop (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = + _pixman_implementation_create (fallback, noop_fast_paths); + imp->src_iter_init = noop_src_iter_init; + imp->dest_iter_init = noop_dest_iter_init; + + return imp; +} diff --git a/programs/develop/libraries/pixman/pixman-private.h b/programs/develop/libraries/pixman/pixman-private.h index e7eaca7649..6d9c05321d 100644 --- a/programs/develop/libraries/pixman/pixman-private.h +++ b/programs/develop/libraries/pixman/pixman-private.h @@ -1,7 +1,26 @@ +#include #ifndef PIXMAN_PRIVATE_H #define PIXMAN_PRIVATE_H +/* + * The defines which are shared between C and assembly code + */ + +/* bilinear interpolation precision (must be <= 8) */ +#define BILINEAR_INTERPOLATION_BITS 7 +#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) + +/* + * C specific part + */ + +#ifndef __ASSEMBLER__ + +#ifndef PACKAGE +# error config.h must be included before pixman-private.h +#endif + #define PIXMAN_DISABLE_DEPRECATED #define PIXMAN_USE_INTERNAL_API @@ -10,6 +29,7 @@ #include #include #include +#include #include "pixman-compiler.h" @@ -17,7 +37,6 @@ * Images */ typedef struct image_common image_common_t; -typedef struct source_image source_image_t; typedef struct solid_fill solid_fill_t; typedef struct gradient gradient_t; typedef struct linear_gradient linear_gradient_t; @@ -28,6 +47,16 @@ typedef struct radial_gradient radial_gradient_t; typedef struct bits_image bits_image_t; typedef struct circle circle_t; +typedef struct argb_t argb_t; + +struct argb_t +{ + float a; + float r; + float g; + float b; +}; + typedef void (*fetch_scanline_t) (pixman_image_t *image, int x, int y, @@ -39,9 +68,9 @@ typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image, int x, int y); -typedef uint64_t (*fetch_pixel_64_t) (bits_image_t *image, - int x, - int y); +typedef argb_t (*fetch_pixel_float_t) (bits_image_t *image, + int x, + int y); typedef void (*store_scanline_t) (bits_image_t * image, int x, @@ -58,17 +87,6 @@ typedef enum SOLID } image_type_t; -typedef enum -{ - SOURCE_IMAGE_CLASS_UNKNOWN, - SOURCE_IMAGE_CLASS_HORIZONTAL, -} source_image_class_t; - -typedef source_image_class_t (*classify_func_t) (pixman_image_t *image, - int x, - int y, - int width, - int height); typedef void (*property_changed_func_t) (pixman_image_t *image); struct image_common @@ -93,10 +111,7 @@ struct image_common int alpha_origin_x; int alpha_origin_y; pixman_bool_t component_alpha; - classify_func_t classify; property_changed_func_t property_changed; - fetch_scanline_t get_scanline_32; - fetch_scanline_t get_scanline_64; pixman_image_destroy_func_t destroy_func; void * destroy_data; @@ -105,26 +120,20 @@ struct image_common pixman_format_code_t extended_format_code; }; -struct source_image -{ - image_common_t common; -}; - struct solid_fill { - source_image_t common; + image_common_t common; pixman_color_t color; uint32_t color_32; - uint64_t color_64; + argb_t color_float; }; struct gradient { - source_image_t common; + image_common_t common; int n_stops; pixman_gradient_stop_t *stops; - int stop_range; }; struct linear_gradient @@ -176,9 +185,9 @@ struct bits_image fetch_pixel_32_t fetch_pixel_32; store_scanline_t store_scanline_32; - fetch_scanline_t fetch_scanline_64; - fetch_pixel_64_t fetch_pixel_64; - store_scanline_t store_scanline_64; + fetch_scanline_t fetch_scanline_float; + fetch_pixel_float_t fetch_pixel_float; + store_scanline_t store_scanline_float; /* Used for indirect access to the bits */ pixman_read_memory_func_t read_func; @@ -190,7 +199,6 @@ union pixman_image image_type_t type; image_common_t common; bits_image_t bits; - source_image_t source; gradient_t gradient; linear_gradient_t linear; conical_gradient_t conical; @@ -198,59 +206,86 @@ union pixman_image solid_fill_t solid; }; +typedef struct pixman_iter_t pixman_iter_t; +typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask); +typedef void (* pixman_iter_write_back_t) (pixman_iter_t *iter); + +typedef enum +{ + ITER_NARROW = (1 << 0), + + /* "Localized alpha" is when the alpha channel is used only to compute + * the alpha value of the destination. This means that the computation + * of the RGB values of the result is independent of the alpha value. + * + * For example, the OVER operator has localized alpha for the + * destination, because the RGB values of the result can be computed + * without knowing the destination alpha. Similarly, ADD has localized + * alpha for both source and destination because the RGB values of the + * result can be computed without knowing the alpha value of source or + * destination. + * + * When the destination is xRGB, this is useful knowledge, because then + * we can treat it as if it were ARGB, which means in some cases we can + * avoid copying it to a temporary buffer. + */ + ITER_LOCALIZED_ALPHA = (1 << 1), + ITER_IGNORE_ALPHA = (1 << 2), + ITER_IGNORE_RGB = (1 << 3) +} iter_flags_t;
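To make the localized-alpha remark above concrete: with premultiplied pixels, OVER computes every destination color channel as src + (1 - srca) * dest, so the destination's own alpha is never read. A one-channel illustration (a sketch, not part of the patch):

/* OVER on a single premultiplied 8-bit channel. Only the source
 * alpha appears in the formula; dest's alpha byte is irrelevant,
 * which is why an x8r8g8b8 destination can be handed to an
 * a8r8g8b8 code path when ITER_LOCALIZED_ALPHA is set. */
static inline uint8_t
over_channel (uint8_t src, uint8_t srca, uint8_t dest)
{
    return src + (uint8_t) ((dest * (255 - srca) + 127) / 255);
}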
+ +struct pixman_iter_t +{ + /* These are initialized by _pixman_implementation_{src,dest}_init */ + pixman_image_t * image; + uint32_t * buffer; + int x, y; + int width; + int height; + iter_flags_t iter_flags; + uint32_t image_flags; + + /* These function pointers are initialized by the implementation */ + pixman_iter_get_scanline_t get_scanline; + pixman_iter_write_back_t write_back; + + /* These fields are scratch data that implementations can use */ + void * data; + uint8_t * bits; + int stride; +}; + void _pixman_bits_image_setup_accessors (bits_image_t *image); void -_pixman_image_get_scanline_generic_64 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask); - -source_image_class_t -_pixman_image_classify (pixman_image_t *image, - int x, - int y, - int width, - int height); +_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_image_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask); - -/* Even thought the type of buffer is uint32_t *, the function actually expects - * a uint64_t *buffer. - */ -void -_pixman_image_get_scanline_64 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *unused); +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter); void -_pixman_image_store_scanline_32 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *buffer); +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); -/* Even though the type of buffer is uint32_t *, the function - * actually expects a uint64_t *buffer.
- */ void -_pixman_image_store_scanline_64 (bits_image_t * image, - int x, - int y, - int width, - const uint32_t *buffer); +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_image_init (pixman_image_t *image); + +pixman_bool_t +_pixman_bits_image_init (pixman_image_t * image, + pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride, + pixman_bool_t clear); +pixman_bool_t +_pixman_image_fini (pixman_image_t *image); pixman_image_t * _pixman_image_allocate (void); @@ -265,10 +300,6 @@ _pixman_image_reset_clip_region (pixman_image_t *image); void _pixman_image_validate (pixman_image_t *image); -uint32_t -_pixman_image_get_solid (pixman_image_t * image, - pixman_format_code_t format); - #define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul) \ do \ { \ @@ -288,33 +319,32 @@ _pixman_image_get_solid (pixman_image_t * image, */ typedef struct { - uint32_t left_ag; - uint32_t left_rb; - uint32_t right_ag; - uint32_t right_rb; - int32_t left_x; - int32_t right_x; - int32_t stepper; + float a_s, a_b; + float r_s, r_b; + float g_s, g_b; + float b_s, b_b; + pixman_fixed_t left_x; + pixman_fixed_t right_x; pixman_gradient_stop_t *stops; int num_stops; - unsigned int spread; + pixman_repeat_t repeat; - int need_reset; + pixman_bool_t need_reset; } pixman_gradient_walker_t; void _pixman_gradient_walker_init (pixman_gradient_walker_t *walker, gradient_t * gradient, - unsigned int spread); + pixman_repeat_t repeat); void _pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, - pixman_fixed_32_32_t pos); + pixman_fixed_48_16_t pos); uint32_t _pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker, - pixman_fixed_32_32_t x); + pixman_fixed_48_16_t x); /* * Edges @@ -352,6 +382,40 @@ pixman_rasterize_edges_accessors (pixman_image_t *image, */ typedef struct pixman_implementation_t pixman_implementation_t; +typedef struct +{ + pixman_op_t op; + pixman_image_t * src_image; + pixman_image_t * mask_image; + pixman_image_t * dest_image; + int32_t src_x; + int32_t src_y; + int32_t mask_x; + int32_t mask_y; + int32_t dest_x; + int32_t dest_y; + int32_t width; + int32_t height; + + uint32_t src_flags; + uint32_t mask_flags; + uint32_t dest_flags; +} pixman_composite_info_t; + +#define PIXMAN_COMPOSITE_ARGS(info) \ + MAYBE_UNUSED pixman_op_t op = info->op; \ + MAYBE_UNUSED pixman_image_t * src_image = info->src_image; \ + MAYBE_UNUSED pixman_image_t * mask_image = info->mask_image; \ + MAYBE_UNUSED pixman_image_t * dest_image = info->dest_image; \ + MAYBE_UNUSED int32_t src_x = info->src_x; \ + MAYBE_UNUSED int32_t src_y = info->src_y; \ + MAYBE_UNUSED int32_t mask_x = info->mask_x; \ + MAYBE_UNUSED int32_t mask_y = info->mask_y; \ + MAYBE_UNUSED int32_t dest_x = info->dest_x; \ + MAYBE_UNUSED int32_t dest_y = info->dest_y; \ + MAYBE_UNUSED int32_t width = info->width; \ + MAYBE_UNUSED int32_t height = info->height + typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dest, @@ -359,26 +423,15 @@ typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp, const uint32_t * mask, int width); -typedef void (*pixman_combine_64_func_t) (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); +typedef void (*pixman_combine_float_func_t) (pixman_implementation_t *imp, + pixman_op_t op, + 
float * dest, + const float * src, + const float * mask, + int n_pixels); typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height); + pixman_composite_info_t *info); typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp, uint32_t * src_bits, uint32_t * dst_bits, @@ -388,8 +441,8 @@ typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp, int dst_bpp, int src_x, int src_y, - int dst_x, - int dst_y, + int dest_x, + int dest_y, int width, int height); typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, @@ -400,10 +453,12 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, int y, int width, int height, - uint32_t xor); + uint32_t filler); +typedef pixman_bool_t (*pixman_iter_init_func_t) (pixman_implementation_t *imp, + pixman_iter_t *iter); void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); -void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp); +void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp); typedef struct { @@ -420,50 +475,46 @@ typedef struct struct pixman_implementation_t { pixman_implementation_t * toplevel; - pixman_implementation_t * delegate; + pixman_implementation_t * fallback; const pixman_fast_path_t * fast_paths; pixman_blt_func_t blt; pixman_fill_func_t fill; + pixman_iter_init_func_t src_iter_init; + pixman_iter_init_func_t dest_iter_init; pixman_combine_32_func_t combine_32[PIXMAN_N_OPERATORS]; pixman_combine_32_func_t combine_32_ca[PIXMAN_N_OPERATORS]; - pixman_combine_64_func_t combine_64[PIXMAN_N_OPERATORS]; - pixman_combine_64_func_t combine_64_ca[PIXMAN_N_OPERATORS]; + pixman_combine_float_func_t combine_float[PIXMAN_N_OPERATORS]; + pixman_combine_float_func_t combine_float_ca[PIXMAN_N_OPERATORS]; }; +uint32_t +_pixman_image_get_solid (pixman_implementation_t *imp, + pixman_image_t * image, + pixman_format_code_t format); + pixman_implementation_t * -_pixman_implementation_create (pixman_implementation_t *delegate, +_pixman_implementation_create (pixman_implementation_t *fallback, const pixman_fast_path_t *fast_paths); void -_pixman_implementation_combine_32 (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width); -void -_pixman_implementation_combine_64 (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); -void -_pixman_implementation_combine_32_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width); -void -_pixman_implementation_combine_64_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); +_pixman_implementation_lookup_composite (pixman_implementation_t *toplevel, + pixman_op_t op, + pixman_format_code_t src_format, + uint32_t src_flags, + pixman_format_code_t mask_format, + uint32_t mask_flags, + pixman_format_code_t dest_format, + uint32_t dest_flags, + pixman_implementation_t **out_imp, + pixman_composite_func_t *out_func); + +pixman_combine_32_func_t +_pixman_implementation_lookup_combiner (pixman_implementation_t *imp, + pixman_op_t op, + pixman_bool_t 
component_alpha, + pixman_bool_t wide); pixman_bool_t _pixman_implementation_blt (pixman_implementation_t *imp, @@ -475,8 +526,8 @@ _pixman_implementation_blt (pixman_implementation_t *imp, int dst_bpp, int src_x, int src_y, - int dst_x, - int dst_y, + int dest_x, + int dest_y, int width, int height); @@ -489,48 +540,112 @@ _pixman_implementation_fill (pixman_implementation_t *imp, int y, int width, int height, - uint32_t xor); + uint32_t filler); + +pixman_bool_t +_pixman_implementation_src_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t flags, + uint32_t image_flags); + +pixman_bool_t +_pixman_implementation_dest_iter_init (pixman_implementation_t *imp, + pixman_iter_t *iter, + pixman_image_t *image, + int x, + int y, + int width, + int height, + uint8_t *buffer, + iter_flags_t flags, + uint32_t image_flags); /* Specific implementations */ pixman_implementation_t * _pixman_implementation_create_general (void); pixman_implementation_t * -_pixman_implementation_create_fast_path (void); +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback); -#ifdef USE_MMX pixman_implementation_t * -_pixman_implementation_create_mmx (void); +_pixman_implementation_create_noop (pixman_implementation_t *fallback); + +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI +pixman_implementation_t * +_pixman_implementation_create_mmx (pixman_implementation_t *fallback); #endif #ifdef USE_SSE2 pixman_implementation_t * -_pixman_implementation_create_sse2 (void); +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback); #endif #ifdef USE_ARM_SIMD pixman_implementation_t * -_pixman_implementation_create_arm_simd (void); +_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback); #endif #ifdef USE_ARM_NEON pixman_implementation_t * -_pixman_implementation_create_arm_neon (void); +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); +#endif + +#ifdef USE_MIPS_DSPR2 +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); #endif #ifdef USE_VMX pixman_implementation_t * -_pixman_implementation_create_vmx (void); +_pixman_implementation_create_vmx (pixman_implementation_t *fallback); #endif +pixman_bool_t +_pixman_implementation_disabled (const char *name); + +pixman_implementation_t * +_pixman_x86_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_arm_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_ppc_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_mips_get_implementations (pixman_implementation_t *imp); + pixman_implementation_t * _pixman_choose_implementation (void); +pixman_bool_t +_pixman_disabled (const char *name); /* * Utilities */ +pixman_bool_t +_pixman_compute_composite_region32 (pixman_region32_t * region, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height); +uint32_t * +_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask); /* These "formats" all have depth 0, so they * will never clash with any real ones @@ -558,14 +673,19 @@ _pixman_choose_implementation (void); #define FAST_PATH_NEAREST_FILTER (1 << 11) #define 
FAST_PATH_HAS_TRANSFORM (1 << 12) #define FAST_PATH_IS_OPAQUE (1 << 13) -#define FAST_PATH_NEEDS_WORKAROUND (1 << 14) +#define FAST_PATH_NO_NORMAL_REPEAT (1 << 14) #define FAST_PATH_NO_NONE_REPEAT (1 << 15) -#define FAST_PATH_SAMPLES_COVER_CLIP (1 << 16) -#define FAST_PATH_X_UNIT_POSITIVE (1 << 17) -#define FAST_PATH_AFFINE_TRANSFORM (1 << 18) -#define FAST_PATH_Y_UNIT_ZERO (1 << 19) -#define FAST_PATH_BILINEAR_FILTER (1 << 20) -#define FAST_PATH_NO_NORMAL_REPEAT (1 << 21) +#define FAST_PATH_X_UNIT_POSITIVE (1 << 16) +#define FAST_PATH_AFFINE_TRANSFORM (1 << 17) +#define FAST_PATH_Y_UNIT_ZERO (1 << 18) +#define FAST_PATH_BILINEAR_FILTER (1 << 19) +#define FAST_PATH_ROTATE_90_TRANSFORM (1 << 20) +#define FAST_PATH_ROTATE_180_TRANSFORM (1 << 21) +#define FAST_PATH_ROTATE_270_TRANSFORM (1 << 22) +#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST (1 << 23) +#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR (1 << 24) +#define FAST_PATH_BITS_IMAGE (1 << 25) +#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER (1 << 26) #define FAST_PATH_PAD_REPEAT \ (FAST_PATH_NO_NONE_REPEAT | \ @@ -601,7 +721,7 @@ _pixman_choose_implementation (void); #define SOURCE_FLAGS(format) \ (FAST_PATH_STANDARD_FLAGS | \ ((PIXMAN_ ## format == PIXMAN_solid) ? \ - 0 : (FAST_PATH_SAMPLES_COVER_CLIP | FAST_PATH_ID_TRANSFORM))) + 0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM))) #define MASK_FLAGS(format, extra) \ ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra)) @@ -632,6 +752,24 @@ _pixman_choose_implementation (void); dest, FAST_PATH_STD_DEST_FLAGS, \ func) } +extern pixman_implementation_t *global_implementation; + +static force_inline pixman_implementation_t * +get_implementation (void) +{ +#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR + if (!global_implementation) + global_implementation = _pixman_choose_implementation (); +#endif + return global_implementation; +} + +/* This function is exported for the sake of the test suite and not part + * of the ABI. 
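The FAST_PATH_* values above are requirement bitmasks: a fast path is usable only when every flag it demands is present in the flags computed for the actual source, mask and destination. A minimal sketch of that subset test (the two low-bit flag values are samples for the demo, defined earlier in the real header; only the bit-25 value is taken from the list above):

#include <stdint.h>
#include <stdio.h>

#define DEMO_FAST_PATH_ID_TRANSFORM  (1 << 0)
#define DEMO_FAST_PATH_NO_ALPHA_MAP  (1 << 1)
#define DEMO_FAST_PATH_BITS_IMAGE    (1 << 25)

/* a fast path matches when the operation's flags are a superset of
 * the flags the path requires */
static int
flags_satisfy (uint32_t have, uint32_t need)
{
    return (have & need) == need;
}

int
main (void)
{
    uint32_t have = DEMO_FAST_PATH_ID_TRANSFORM | DEMO_FAST_PATH_NO_ALPHA_MAP;

    printf ("%d\n", flags_satisfy (have, DEMO_FAST_PATH_ID_TRANSFORM)); /* 1 */
    printf ("%d\n", flags_satisfy (have, DEMO_FAST_PATH_BITS_IMAGE));   /* 0 */
    return 0;
}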
+ */ +PIXMAN_EXPORT pixman_implementation_t * +_pixman_internal_only_get_implementation (void); + /* Memory allocation helpers */ void * pixman_malloc_ab (unsigned int n, unsigned int b); @@ -640,23 +778,25 @@ void * pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); pixman_bool_t -pixman_multiply_overflows_int (unsigned int a, unsigned int b); +_pixman_multiply_overflows_size (size_t a, size_t b); pixman_bool_t -pixman_addition_overflows_int (unsigned int a, unsigned int b); +_pixman_multiply_overflows_int (unsigned int a, unsigned int b); + +pixman_bool_t +_pixman_addition_overflows_int (unsigned int a, unsigned int b); /* Compositing utilities */ void -pixman_expand (uint64_t * dst, - const uint32_t * src, - pixman_format_code_t format, - int width); +pixman_expand_to_float (argb_t *dst, + const uint32_t *src, + pixman_format_code_t format, + int width); void -pixman_contract (uint32_t * dst, - const uint64_t *src, - int width); - +pixman_contract_from_float (uint32_t *dst, + const argb_t *src, + int width); /* Region Helpers */ pixman_bool_t @@ -667,6 +807,50 @@ pixman_bool_t pixman_region16_copy_from_region32 (pixman_region16_t *dst, pixman_region32_t *src); +/* Doubly linked lists */ +typedef struct pixman_link_t pixman_link_t; +struct pixman_link_t +{ + pixman_link_t *next; + pixman_link_t *prev; +}; + +typedef struct pixman_list_t pixman_list_t; +struct pixman_list_t +{ + pixman_link_t *head; + pixman_link_t *tail; +}; + +static force_inline void +pixman_list_init (pixman_list_t *list) +{ + list->head = (pixman_link_t *)list; + list->tail = (pixman_link_t *)list; +} + +static force_inline void +pixman_list_prepend (pixman_list_t *list, pixman_link_t *link) +{ + link->next = list->head; + link->prev = (pixman_link_t *)list; + list->head->prev = link; + list->head = link; +} + +static force_inline void +pixman_list_unlink (pixman_link_t *link) +{ + link->prev->next = link->next; + link->next->prev = link->prev; +} + +static force_inline void +pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link) +{ + pixman_list_unlink (link); + pixman_list_prepend (list, link); +} /* Misc macros */ @@ -696,29 +880,62 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst, #define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? 
(high) : (v))) +#define FLOAT_IS_ZERO(f) (-FLT_MIN < (f) && (f) < FLT_MIN) + /* Conversion between 8888 and 0565 */ -#define CONVERT_8888_TO_0565(s) \ - ((((s) >> 3) & 0x001f) | \ - (((s) >> 5) & 0x07e0) | \ - (((s) >> 8) & 0xf800)) +static force_inline uint16_t +convert_8888_to_0565 (uint32_t s) +{ + /* The following code can be compiled into just 4 instructions on ARM */ + uint32_t a, b; + a = (s >> 3) & 0x1F001F; + b = s & 0xFC00; + a |= a >> 5; + a |= b >> 5; + return (uint16_t)a; +} -#define CONVERT_0565_TO_0888(s) \ - (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | \ - ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | \ - ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))) +static force_inline uint32_t +convert_0565_to_0888 (uint16_t s) +{ + return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | + ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | + ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))); +} -#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000) +static force_inline uint32_t +convert_0565_to_8888 (uint16_t s) +{ + return convert_0565_to_0888 (s) | 0xff000000; +} /* Trivial versions that are useful in macros */ -#define CONVERT_8888_TO_8888(s) (s) -#define CONVERT_0565_TO_0565(s) (s) + +static force_inline uint32_t +convert_8888_to_8888 (uint32_t s) +{ + return s; +} + +static force_inline uint32_t +convert_x888_to_8888 (uint32_t s) +{ + return s | 0xff000000; +} + +static force_inline uint16_t +convert_0565_to_0565 (uint16_t s) +{ + return s; +} #define PIXMAN_FORMAT_IS_WIDE(f) \ (PIXMAN_FORMAT_A (f) > 8 || \ PIXMAN_FORMAT_R (f) > 8 || \ PIXMAN_FORMAT_G (f) > 8 || \ - PIXMAN_FORMAT_B (f) > 8) + PIXMAN_FORMAT_B (f) > 8 || \ + PIXMAN_FORMAT_TYPE (f) == PIXMAN_TYPE_ARGB_SRGB) #ifdef WORDS_BIGENDIAN # define SCREEN_SHIFT_LEFT(x,n) ((x) << (n)) @@ -728,6 +945,52 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst, # define SCREEN_SHIFT_RIGHT(x,n) ((x) << (n)) #endif +static force_inline uint32_t +unorm_to_unorm (uint32_t val, int from_bits, int to_bits) +{ + uint32_t result; + + if (from_bits == 0) + return 0; + + /* Delete any extra bits */ + val &= ((1 << from_bits) - 1); + + if (from_bits >= to_bits) + return val >> (from_bits - to_bits); + + /* Start out with the high bit of val in the high bit of result. */ + result = val << (to_bits - from_bits); + + /* Copy the bits in result, doubling the number of bits each time, until + * we fill all to_bits. Unrolled manually because from_bits and to_bits + * are usually known statically, so the compiler can turn all of this + * into a few shifts. 
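convert_8888_to_0565() above masks red and blue into one register so both can be reduced with a single shift-and-or, and the 0565-to-0888 direction replicates each channel's high bits into the freed low bits so full intensity survives the round trip (0xff comes back as 0xff, not 0xf8). A standalone copy of both routines for checking that behaviour; the demo_ prefix is ours:

#include <stdint.h>
#include <stdio.h>

static uint16_t
demo_8888_to_0565 (uint32_t s)
{
    uint32_t a, b;

    a = (s >> 3) & 0x1F001F;    /* r5 in bits 16-20, b5 in bits 0-4 */
    b = s & 0xFC00;             /* g6 still in bits 10-15 */
    a |= a >> 5;                /* fold r5 down to bits 11-15 */
    a |= b >> 5;                /* move g6 to bits 5-10 */

    return (uint16_t)a;         /* rrrrrggggggbbbbb */
}

static uint32_t
demo_0565_to_8888 (uint16_t s)
{
    /* each channel: high bits shifted up, then replicated into the
     * low bits left empty by the widening */
    return ((((s << 3) & 0xf8) | ((s >> 2) & 0x7)) |
            (((s << 5) & 0xfc00) | ((s >> 1) & 0x300)) |
            (((s << 8) & 0xf80000) | ((s << 3) & 0x70000))) | 0xff000000;
}

int
main (void)
{
    printf ("%04x\n", demo_8888_to_0565 (0xffffffff)); /* ffff */
    printf ("%08x\n", demo_0565_to_8888 (0xffff));     /* ffffffff */
    return 0;
}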
+ */ +#define REPLICATE() \ + do \ + { \ + if (from_bits < to_bits) \ + { \ + result |= result >> from_bits; \ + \ + from_bits *= 2; \ + } \ + } \ + while (0) + + REPLICATE(); + REPLICATE(); + REPLICATE(); + REPLICATE(); + REPLICATE(); + + return result; +} + +uint16_t pixman_float_to_unorm (float f, int n_bits); +float pixman_unorm_to_float (uint16_t u, int n_bits); + /* * Various debugging code */ @@ -754,15 +1017,13 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst, #endif -#ifdef DEBUG - void _pixman_log_error (const char *function, const char *message); #define return_if_fail(expr) \ do \ { \ - if (!(expr)) \ + if (unlikely (!(expr))) \ { \ _pixman_log_error (FUNC, "The expression " # expr " was false"); \ return; \ @@ -773,7 +1034,7 @@ _pixman_log_error (const char *function, const char *message); #define return_val_if_fail(expr, retval) \ do \ { \ - if (!(expr)) \ + if (unlikely (!(expr))) \ { \ _pixman_log_error (FUNC, "The expression " # expr " was false"); \ return (retval); \ @@ -784,38 +1045,31 @@ _pixman_log_error (const char *function, const char *message); #define critical_if_fail(expr) \ do \ { \ - if (!(expr)) \ + if (unlikely (!(expr))) \ _pixman_log_error (FUNC, "The expression " # expr " was false"); \ } \ while (0) +/* + * Matrix + */ -#else +typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t; -#define _pixman_log_error(f,m) do { } while (0) \ +pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); -#define return_if_fail(expr) \ - do \ - { \ - if (!(expr)) \ - return; \ - } \ - while (0) +void +pixman_transform_point_31_16_3d (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); -#define return_val_if_fail(expr, retval) \ - do \ - { \ - if (!(expr)) \ - return (retval); \ - } \ - while (0) - -#define critical_if_fail(expr) \ - do \ - { \ - } \ - while (0) -#endif +void +pixman_transform_point_31_16_affine (const pixman_transform_t *t, + const pixman_vector_48_16_t *v, + pixman_vector_48_16_t *result); /* * Timers @@ -826,10 +1080,11 @@ _pixman_log_error (const char *function, const char *message); static inline uint64_t oil_profile_stamp_rdtsc (void) { - uint64_t ts; + uint32_t hi, lo; - __asm__ __volatile__ ("rdtsc\n" : "=A" (ts)); - return ts; + __asm__ __volatile__ ("rdtsc\n" : "=a" (lo), "=d" (hi)); + + return lo | (((uint64_t)hi) << 32); } #define OIL_STAMP oil_profile_stamp_rdtsc @@ -868,6 +1123,13 @@ void pixman_timer_register (pixman_timer_t *timer); timer ## tname.total += OIL_STAMP () - begin ## tname; \ } +#else + +#define TIMER_BEGIN(tname) +#define TIMER_END(tname) + #endif /* PIXMAN_TIMERS */ +#endif /* __ASSEMBLER__ */ + #endif /* PIXMAN_PRIVATE_H */ diff --git a/programs/develop/libraries/pixman/pixman-radial-gradient.c b/programs/develop/libraries/pixman/pixman-radial-gradient.c index fa5725840f..6a217963da 100644 --- a/programs/develop/libraries/pixman/pixman-radial-gradient.c +++ b/programs/develop/libraries/pixman/pixman-radial-gradient.c @@ -78,11 +78,11 @@ radial_compute_color (double a, { /* * In this function error propagation can lead to bad results: - * - det can have an unbound error (if b*b-a*c is very small), + * - discr can have an unbound error (if b*b-a*c is very small), * potentially making it the opposite sign of what it should have been * (thus clearing a pixel that would have been colored or vice-versa) - * or propagating the error to sqrtdet; - * if det has the wrong sign or 
b is very small, this can lead to bad + * or propagating the error to sqrtdiscr; + * if discr has the wrong sign or b is very small, this can lead to bad * results * * - the algorithm used to compute the solutions of the quadratic @@ -92,7 +92,7 @@ radial_compute_color (double a, * * - the above problems are worse if a is small (as inva becomes bigger) */ - double det; + double discr; if (a == 0) { @@ -109,22 +109,33 @@ radial_compute_color (double a, } else { - if (t * dr > mindr) + if (t * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t); } return 0; } - det = fdot (b, a, 0, b, -c, 0); - if (det >= 0) + discr = fdot (b, a, 0, b, -c, 0); + if (discr >= 0) { - double sqrtdet, t0, t1; + double sqrtdiscr, t0, t1; - sqrtdet = sqrt (det); - t0 = (b + sqrtdet) * inva; - t1 = (b - sqrtdet) * inva; + sqrtdiscr = sqrt (discr); + t0 = (b + sqrtdiscr) * inva; + t1 = (b - sqrtdiscr) * inva; + /* + * The root that must be used is the biggest one that belongs + * to the valid range ([0,1] for PIXMAN_REPEAT_NONE, any + * solution that results in a positive radius otherwise). + * + * If a > 0, t0 is the biggest solution, so if it is valid, it + * is the correct result. + * + * If a < 0, only one of the solutions can be valid, so the + * order in which they are tested is not important. + */ if (repeat == PIXMAN_REPEAT_NONE) { if (0 <= t0 && t0 <= pixman_fixed_1) @@ -134,9 +145,9 @@ radial_compute_color (double a, } else { - if (t0 * dr > mindr) + if (t0 * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t0); - else if (t1 * dr > mindr) + else if (t1 * dr >= mindr) return _pixman_gradient_walker_pixel (walker, t1); } } @@ -144,19 +155,14 @@ radial_compute_color (double a, return 0; } -static void -radial_gradient_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) +static uint32_t * +radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) { /* * Implementation of radial gradients following the PDF specification. * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference * Manual (PDF 32000-1:2008 at the time of this writing). - * + * * In the radial gradient problem we are given two circles (c₁,r₁) and * (c₂,r₂) that define the gradient itself. * @@ -173,7 +179,7 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, * * The graphical result is the same as drawing the valid (radius > 0) * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient - * is not repeated) using SOURCE operatior composition. + * is not repeated) using SOURCE operator composition. 
* * It looks like a cone pointing towards the viewer if the ending circle * is smaller than the starting one, a cone pointing inside the page if @@ -184,14 +190,14 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, * in, compute the t values for that point, solving for t in: * * length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂ - * + * * Let's rewrite it in a simpler way, by defining some auxiliary * variables: * * cd = c₂ - c₁ * pd = p - c₁ * dr = r₂ - r₁ - * lenght(t·cd - pd) = r₁ + t·dr + * length(t·cd - pd) = r₁ + t·dr * * which actually means * @@ -217,7 +223,7 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, * B = pdx·cdx + pdy·cdy + r₁·dr * C = pdx² + pdy² - r₁² * At² - 2Bt + C = 0 - * + * * The solutions (unless the equation degenerates because of A = 0) are: * * t = (B ± ⎷(B² - A·C)) / A @@ -233,9 +239,13 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, * <=> for every p, the radiuses associated with the two t solutions * have opposite sign */ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + uint32_t *buffer = iter->buffer; gradient_t *gradient = (gradient_t *)image; - source_image_t *source = (source_image_t *)image; radial_gradient_t *radial = (radial_gradient_t *)image; uint32_t *end = buffer + width; pixman_gradient_walker_t walker; @@ -246,16 +256,16 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; - _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); + _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); - if (source->common.transform) + if (image->common.transform) { - if (!pixman_transform_point_3d (source->common.transform, &v)) - return; - - unit.vector[0] = source->common.transform->matrix[0][0]; - unit.vector[1] = source->common.transform->matrix[1][0]; - unit.vector[2] = source->common.transform->matrix[2][0]; + if (!pixman_transform_point_3d (image->common.transform, &v)) + return iter->buffer; + + unit.vector[0] = image->common.transform->matrix[0][0]; + unit.vector[1] = image->common.transform->matrix[1][0]; + unit.vector[2] = image->common.transform->matrix[2][0]; } else { @@ -325,7 +335,7 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, radial->delta.radius, radial->mindr, &walker, - source->common.repeat); + image->common.repeat); } b += db; @@ -370,14 +380,14 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, radial->delta.radius, radial->mindr, &walker, - source->common.repeat); + image->common.repeat); } else { *buffer = 0; } } - + ++buffer; v.vector[0] += unit.vector[0]; @@ -385,18 +395,34 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, v.vector[2] += unit.vector[2]; } } + + iter->y++; + return iter->buffer; } -static void -radial_gradient_property_changed (pixman_image_t *image) +static uint32_t * +radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) { - image->common.get_scanline_32 = radial_gradient_get_scanline_32; - image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; + uint32_t *buffer = radial_get_scanline_narrow (iter, NULL); + + pixman_expand_to_float ( + (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width); + + return buffer; +} + +void +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ + if (iter->iter_flags & ITER_NARROW) + iter->get_scanline = radial_get_scanline_narrow; + else + iter->get_scanline = radial_get_scanline_wide; } 
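To make the quadratic above concrete, here is a numeric check of t = (B ± √(B² − A·C)) / A with invented circle values (not from pixman): c₁ = (0,0) with r₁ = 0.25, c₂ = (1,0) with r₂ = 0.5, sampled at p = (0.5, 0). Both roots name circles that genuinely pass through p, and the larger root t0 is the one preferred when a > 0, as radial_compute_color's comment explains:

#include <math.h>
#include <stdio.h>

int
main (void)
{
    double cdx = 1.0, cdy = 0.0, dr = 0.5 - 0.25;
    double pdx = 0.5, pdy = 0.0, r1 = 0.25;

    double A = cdx * cdx + cdy * cdy - dr * dr;   /* 0.9375 */
    double B = pdx * cdx + pdy * cdy + r1 * dr;   /* 0.5625 */
    double C = pdx * pdx + pdy * pdy - r1 * r1;   /* 0.1875 */

    double sqrtdiscr = sqrt (B * B - A * C);      /* 0.375  */
    double t0 = (B + sqrtdiscr) / A;              /* 1.0    */
    double t1 = (B - sqrtdiscr) / A;              /* 0.2    */

    /* for each root, the distance from p to the interpolated center
     * (t·cd) equals the interpolated radius r1 + t·dr: 0.5 and 0.3 */
    printf ("t0 = %g (radius %g), t1 = %g (radius %g)\n",
            t0, r1 + t0 * dr, t1, r1 + t1 * dr);
    return 0;
}

(Distance check: for t0 the center is (1,0) and |p − c| = 0.5 = r₁ + t0·dr; for t1 the center is (0.2,0) and |p − c| = 0.3 = r₁ + t1·dr.)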
PIXMAN_EXPORT pixman_image_t * -pixman_image_create_radial_gradient (pixman_point_fixed_t * inner, - pixman_point_fixed_t * outer, +pixman_image_create_radial_gradient (const pixman_point_fixed_t * inner, + const pixman_point_fixed_t * outer, pixman_fixed_t inner_radius, pixman_fixed_t outer_radius, const pixman_gradient_stop_t *stops, @@ -441,8 +467,5 @@ pixman_image_create_radial_gradient (pixman_point_fixed_t * inner, radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius; - image->common.property_changed = radial_gradient_property_changed; - return image; } - diff --git a/programs/develop/libraries/pixman/pixman-region.c b/programs/develop/libraries/pixman/pixman-region.c index 4e7c8db75e..59bc9c7971 100644 --- a/programs/develop/libraries/pixman/pixman-region.c +++ b/programs/develop/libraries/pixman/pixman-region.c @@ -102,7 +102,11 @@ static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 }; static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 }; +#if defined (__llvm__) && !defined (__clang__) +static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 }; +#else static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 }; +#endif static box_type_t *pixman_region_empty_box = (box_type_t *)&PREFIX (_empty_box_); @@ -198,7 +202,7 @@ PIXREGION_SZOF (size_t n) return size + sizeof(region_data_type_t); } -static void * +static region_data_type_t * alloc_data (size_t n) { size_t sz = PIXREGION_SZOF (n); @@ -738,8 +742,7 @@ typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region, box_type_t * r2, box_type_t * r2_end, int y1, - int y2, - int * overlap); + int y2); static pixman_bool_t pixman_op (region_type_t * new_reg, /* Place to store result */ @@ -750,10 +753,10 @@ pixman_op (region_type_t * new_reg, /* Place to store result int append_non1, /* Append non-overlapping bands * in region 1 ? */ - int append_non2, /* Append non-overlapping bands + int append_non2 /* Append non-overlapping bands * in region 2 ? 
*/ - int * overlap) + ) { box_type_t *r1; /* Pointer into first region */ box_type_t *r2; /* Pointer into 2d region */ @@ -824,8 +827,7 @@ pixman_op (region_type_t * new_reg, /* Place to store result { if (!pixman_rect_alloc (new_reg, new_size)) { - if (old_data) - free (old_data); + free (old_data); return FALSE; } } @@ -932,8 +934,7 @@ pixman_op (region_type_t * new_reg, /* Place to store result if (!(*overlap_func)(new_reg, r1, r1_band_end, r2, r2_band_end, - ytop, ybot, - overlap)) + ytop, ybot)) { goto bail; } @@ -1001,8 +1002,7 @@ pixman_op (region_type_t * new_reg, /* Place to store result APPEND_REGIONS (new_reg, r2_band_end, r2_end); } - if (old_data) - free (old_data); + free (old_data); if (!(numRects = new_reg->data->numRects)) { @@ -1023,8 +1023,7 @@ pixman_op (region_type_t * new_reg, /* Place to store result return TRUE; bail: - if (old_data) - free (old_data); + free (old_data); return pixman_break (new_reg); } @@ -1112,8 +1111,7 @@ pixman_region_intersect_o (region_type_t *region, box_type_t * r2, box_type_t * r2_end, int y1, - int y2, - int * overlap) + int y2) { int x1; int x2; @@ -1209,13 +1207,9 @@ PREFIX (_intersect) (region_type_t * new_reg, else { /* General purpose intersection */ - int overlap; /* result ignored */ - if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE, - &overlap)) - { + if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE)) return FALSE; - } pixman_set_extents (new_reg); } @@ -1230,9 +1224,6 @@ PREFIX (_intersect) (region_type_t * new_reg, if (r->x1 <= x2) \ { \ /* Merge with current rectangle */ \ - if (r->x1 < x2) \ - *overlap = TRUE; \ - \ if (x2 < r->x2) \ x2 = r->x2; \ } \ @@ -1272,8 +1263,7 @@ pixman_region_union_o (region_type_t *region, box_type_t * r2, box_type_t * r2_end, int y1, - int y2, - int * overlap) + int y2) { box_type_t *next_rect; int x1; /* left and right side of current union */ @@ -1382,8 +1372,6 @@ PREFIX (_union) (region_type_t *new_reg, region_type_t *reg1, region_type_t *reg2) { - int overlap; /* result ignored */ - /* Return TRUE if some overlap * between reg1, reg2 */ @@ -1449,7 +1437,7 @@ PREFIX (_union) (region_type_t *new_reg, return TRUE; } - if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE, &overlap)) + if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE)) return FALSE; new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1); @@ -1516,9 +1504,7 @@ quick_sort_rects ( r++; i++; } - - while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1))) - ; + while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1))); r = &(rects[j]); do @@ -1579,8 +1565,7 @@ quick_sort_rects ( */ static pixman_bool_t -validate (region_type_t * badreg, - int * overlap) +validate (region_type_t * badreg) { /* Descriptor for regions under construction in Step 2. 
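pixman_op() above is the band-sweep engine underneath every public region operation; this patch drops its overlap out-parameter because no caller consumed the result. For orientation, a small usage sketch against the public API (build with pkg-config pixman-1) showing the banding it produces: the L-shaped union of two overlapping squares is stored as three y-bands, hence three boxes:

#include <pixman.h>
#include <stdio.h>

int
main (void)
{
    pixman_region32_t a, b, u;

    pixman_region32_init_rect (&a, 0, 0, 10, 10);  /* box (0,0)-(10,10) */
    pixman_region32_init_rect (&b, 5, 5, 10, 10);  /* box (5,5)-(15,15) */
    pixman_region32_init (&u);

    /* runs pixman_op() with pixman_region_union_o underneath */
    pixman_region32_union (&u, &a, &b);

    /* bands y 0-5, y 5-10 (x-coalesced to one box) and y 10-15 */
    printf ("%d boxes\n", pixman_region32_n_rects (&u));   /* 3 */

    pixman_region32_fini (&a);
    pixman_region32_fini (&b);
    pixman_region32_fini (&u);
    return 0;
}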
*/ typedef struct @@ -1605,7 +1590,6 @@ validate (region_type_t * badreg, region_type_t *hreg; /* ri[j_half].reg */ pixman_bool_t ret = TRUE; - *overlap = FALSE; if (!badreg->data) { GOOD (badreg); @@ -1679,9 +1663,6 @@ validate (region_type_t * badreg, if (box->x1 <= ri_box->x2) { /* Merge it with ri_box */ - if (box->x1 < ri_box->x2) - *overlap = TRUE; - if (box->x2 > ri_box->x2) ri_box->x2 = box->x2; } @@ -1785,7 +1766,7 @@ validate (region_type_t * badreg, reg = &ri[j].reg; hreg = &ri[j + half].reg; - if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE, overlap)) + if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE)) ret = FALSE; if (hreg->extents.x1 < reg->extents.x1) @@ -1853,8 +1834,7 @@ pixman_region_subtract_o (region_type_t * region, box_type_t * r2, box_type_t * r2_end, int y1, - int y2, - int * overlap) + int y2) { box_type_t * next_rect; int x1; @@ -1878,7 +1858,7 @@ pixman_region_subtract_o (region_type_t * region, else if (r2->x1 <= x1) { /* - * Subtrahend preceeds minuend: nuke left edge of minuend. + * Subtrahend precedes minuend: nuke left edge of minuend. */ x1 = r2->x2; if (x1 >= r1->x2) @@ -1978,8 +1958,6 @@ PREFIX (_subtract) (region_type_t *reg_d, region_type_t *reg_m, region_type_t *reg_s) { - int overlap; /* result ignored */ - GOOD (reg_m); GOOD (reg_s); GOOD (reg_d); @@ -2004,9 +1982,9 @@ PREFIX (_subtract) (region_type_t *reg_d, } /* Add those rectangles in region 1 that aren't in region 2, - do yucky substraction for overlaps, and + do yucky subtraction for overlaps, and just throw away rectangles in region 2 that aren't in region 1 */ - if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE, &overlap)) + if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE)) return FALSE; /* @@ -2040,15 +2018,13 @@ PREFIX (_subtract) (region_type_t *reg_d, * *----------------------------------------------------------------------- */ -pixman_bool_t -PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg, /* Destination region */ - region_type_t *reg1, /* Region to invert */ - box_type_t * inv_rect) /* Bounding box for inversion */ +PIXMAN_EXPORT pixman_bool_t +PREFIX (_inverse) (region_type_t *new_reg, /* Destination region */ + region_type_t *reg1, /* Region to invert */ + box_type_t * inv_rect) /* Bounding box for inversion */ { region_type_t inv_reg; /* Quick and dirty region made from the * bounding box */ - int overlap; /* result ignored */ - GOOD (reg1); GOOD (new_reg); @@ -2066,12 +2042,12 @@ PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg, /* Destination region } /* Add those rectangles in region 1 that aren't in region 2, - * do yucky substraction for overlaps, and + * do yucky subtraction for overlaps, and * just throw away rectangles in region 2 that aren't in region 1 */ inv_reg.extents = *inv_rect; inv_reg.data = (region_data_type_t *)NULL; - if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE, &overlap)) + if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE)) return FALSE; /* @@ -2086,6 +2062,40 @@ PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg, /* Destination region return TRUE; } +/* In time O(log n), locate the first box whose y2 is greater than y. + * Return @end if no such box exists. 
+ */ +static box_type_t * +find_box_for_y (box_type_t *begin, box_type_t *end, int y) +{ + box_type_t *mid; + + if (end == begin) + return end; + + if (end - begin == 1) + { + if (begin->y2 > y) + return begin; + else + return end; + } + + mid = begin + (end - begin) / 2; + if (mid->y2 > y) + { + /* If no box is found in [begin, mid], the function + * will return @mid, which is then known to be the + * correct answer. + */ + return find_box_for_y (begin, mid, y); + } + else + { + return find_box_for_y (mid, end, y); + } +} + /* * rect_in(region, rect) * This routine takes a pointer to a region and a pointer to a box @@ -2102,10 +2112,9 @@ PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg, /* Destination region * partially in the region) or is outside the region (we reached a band * that doesn't overlap the box at all and part_in is false) */ - -pixman_region_overlap_t -PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t * region, - box_type_t * prect) +PIXMAN_EXPORT pixman_region_overlap_t +PREFIX (_contains_rectangle) (region_type_t * region, + box_type_t * prect) { box_type_t * pbox; box_type_t * pbox_end; @@ -2139,12 +2148,15 @@ PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t * region, /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */ for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects; - pbox != pbox_end; - pbox++) + pbox != pbox_end; + pbox++) { - - if (pbox->y2 <= y) - continue; /* getting up to speed or skipping remainder of band */ + /* getting up to speed or skipping remainder of band */ + if (pbox->y2 <= y) + { + if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end) + break; + } if (pbox->y1 > y) { @@ -2319,6 +2331,16 @@ PREFIX (_reset) (region_type_t *region, box_type_t *box) region->data = NULL; } +PIXMAN_EXPORT void +PREFIX (_clear) (region_type_t *region) +{ + GOOD (region); + FREE_DATA (region); + + region->extents = *pixman_region_empty_box; + region->data = pixman_region_empty_data; +} + /* box is "return" value */ PIXMAN_EXPORT int PREFIX (_contains_point) (region_type_t * region, @@ -2342,13 +2364,13 @@ PREFIX (_contains_point) (region_type_t * region, return(TRUE); } - for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects; - pbox != pbox_end; - pbox++) - { - if (y >= pbox->y2) - continue; /* not there yet */ + pbox = PIXREGION_BOXPTR (region); + pbox_end = pbox + numRects; + pbox = find_box_for_y (pbox, pbox_end, y); + + for (;pbox != pbox_end; pbox++) + { if ((y < pbox->y1) || (x < pbox->x1)) break; /* missed it */ @@ -2528,7 +2550,7 @@ PREFIX (_init_rects) (region_type_t *region, /* Validate */ region->extents.x1 = region->extents.x2 = 0; - return validate (region, &i); + return validate (region); } #define READ(_ptr) (*(_ptr)) @@ -2545,8 +2567,7 @@ bitmap_addrect (region_type_t *reg, ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) && ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2)))) { - if (!reg->data || - reg->data->numRects == reg->data->size) + if (reg->data->numRects == reg->data->size) { if (!pixman_rect_alloc (reg, 1)) return NULL; @@ -2590,6 +2611,8 @@ PREFIX (_init_from_image) (region_type_t *region, PREFIX(_init) (region); + critical_if_fail (region->data); + return_if_fail (image->type == BITS); return_if_fail (image->bits.format == PIXMAN_a1); diff --git a/programs/develop/libraries/pixman/pixman-solid-fill.c b/programs/develop/libraries/pixman/pixman-solid-fill.c index 1d911e99d9..5f9fef6306 100644 --- a/programs/develop/libraries/pixman/pixman-solid-fill.c +++ 
b/programs/develop/libraries/pixman/pixman-solid-fill.c @@ -26,56 +26,6 @@ #endif #include "pixman-private.h" -static void -solid_fill_get_scanline_32 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - uint32_t *end = buffer + width; - uint32_t color = image->solid.color_32; - - while (buffer < end) - *(buffer++) = color; - - return; -} - -static void -solid_fill_get_scanline_64 (pixman_image_t *image, - int x, - int y, - int width, - uint32_t * buffer, - const uint32_t *mask) -{ - uint64_t *b = (uint64_t *)buffer; - uint64_t *e = b + width; - uint64_t color = image->solid.color_64; - - while (b < e) - *(b++) = color; -} - -static source_image_class_t -solid_fill_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) -{ - return SOURCE_IMAGE_CLASS_HORIZONTAL; -} - -static void -solid_fill_property_changed (pixman_image_t *image) -{ - image->common.get_scanline_32 = solid_fill_get_scanline_32; - image->common.get_scanline_64 = solid_fill_get_scanline_64; -} - static uint32_t color_to_uint32 (const pixman_color_t *color) { @@ -86,18 +36,21 @@ color_to_uint32 (const pixman_color_t *color) (color->blue >> 8); } -static uint64_t -color_to_uint64 (const pixman_color_t *color) +static argb_t +color_to_float (const pixman_color_t *color) { - return - ((uint64_t)color->alpha << 48) | - ((uint64_t)color->red << 32) | - ((uint64_t)color->green << 16) | - ((uint64_t)color->blue); + argb_t result; + + result.a = pixman_unorm_to_float (color->alpha, 16); + result.r = pixman_unorm_to_float (color->red, 16); + result.g = pixman_unorm_to_float (color->green, 16); + result.b = pixman_unorm_to_float (color->blue, 16); + + return result; } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_solid_fill (pixman_color_t *color) +pixman_image_create_solid_fill (const pixman_color_t *color) { pixman_image_t *img = _pixman_image_allocate (); @@ -107,10 +60,7 @@ pixman_image_create_solid_fill (pixman_color_t *color) img->type = SOLID; img->solid.color = *color; img->solid.color_32 = color_to_uint32 (color); - img->solid.color_64 = color_to_uint64 (color); - - img->common.classify = solid_fill_classify; - img->common.property_changed = solid_fill_property_changed; + img->solid.color_float = color_to_float (color); return img; } diff --git a/programs/develop/libraries/pixman/pixman-sse2.c b/programs/develop/libraries/pixman/pixman-sse2.c new file mode 100644 index 0000000000..863bc18ada --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-sse2.c @@ -0,0 +1,6449 @@ +/* + * Copyright © 2008 Rodrigo Kumpera + * Copyright © 2008 André Tupinambá + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Rodrigo Kumpera (kumpera@gmail.com) + * André Tupinambá (andrelrt@gmail.com) + * + * Based on work by Owen Taylor and Søren Sandmann + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ +#include <emmintrin.h> /* for SSE2 intrinsics */ +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +static __m128i mask_0080; +static __m128i mask_00ff; +static __m128i mask_0101; +static __m128i mask_ffff; +static __m128i mask_ff000000; +static __m128i mask_alpha; + +static __m128i mask_565_r; +static __m128i mask_565_g1, mask_565_g2; +static __m128i mask_565_b; +static __m128i mask_red; +static __m128i mask_green; +static __m128i mask_blue; + +static __m128i mask_565_fix_rb; +static __m128i mask_565_fix_g; + +static __m128i mask_565_rb; +static __m128i mask_565_pack_multiplier; + +static force_inline __m128i +unpack_32_1x128 (uint32_t data) +{ + return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); +} + +static force_inline void +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) +{ + *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); + *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); +} + +static force_inline __m128i +unpack_565_to_8888 (__m128i lo) +{ + __m128i r, g, b, rb, t; + + r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); + g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); + b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); + + rb = _mm_or_si128 (r, b); + t = _mm_and_si128 (rb, mask_565_fix_rb); + t = _mm_srli_epi32 (t, 5); + rb = _mm_or_si128 (rb, t); + + t = _mm_and_si128 (g, mask_565_fix_g); + t = _mm_srli_epi32 (t, 6); + g = _mm_or_si128 (g, t); + + return _mm_or_si128 (rb, g); +} + +static force_inline void +unpack_565_128_4x128 (__m128i data, + __m128i* data0, + __m128i* data1, + __m128i* data2, + __m128i* data3) +{ + __m128i lo, hi; + + lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); + hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); + + lo = unpack_565_to_8888 (lo); + hi = unpack_565_to_8888 (hi); + + unpack_128_2x128 (lo, data0, data1); + unpack_128_2x128 (hi, data2, data3); +} + +static force_inline uint16_t +pack_565_32_16 (uint32_t pixel) +{ + return (uint16_t) (((pixel >> 8) & 0xf800) | + ((pixel >> 5) & 0x07e0) | + ((pixel >> 3) & 0x001f)); +} + +static force_inline __m128i +pack_2x128_128 (__m128i lo, __m128i hi) +{ + return _mm_packus_epi16 (lo, hi); +} + +static force_inline __m128i +pack_565_2packedx128_128 (__m128i lo, __m128i hi) +{ + __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); + __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); + + __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); + __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); + + __m128i g0 = _mm_and_si128 (lo, mask_green); + __m128i g1 = _mm_and_si128 (hi, mask_green); + + t0 = _mm_or_si128 (t0, g0); + t1 = _mm_or_si128 (t1, g1); + + /* Simulates _mm_packus_epi32 */ + t0 = _mm_slli_epi32 (t0, 16 - 5); + t1 = _mm_slli_epi32 (t1, 16 - 5); +
t0 = _mm_srai_epi32 (t0, 16); + t1 = _mm_srai_epi32 (t1, 16); + return _mm_packs_epi32 (t0, t1); +} + +static force_inline __m128i +pack_565_2x128_128 (__m128i lo, __m128i hi) +{ + __m128i data; + __m128i r, g1, g2, b; + + data = pack_2x128_128 (lo, hi); + + r = _mm_and_si128 (data, mask_565_r); + g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); + g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); + b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); + + return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); +} + +static force_inline __m128i +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) +{ + return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), + pack_565_2x128_128 (*xmm2, *xmm3)); +} + +static force_inline int +is_opaque (__m128i x) +{ + __m128i ffs = _mm_cmpeq_epi8 (x, x); + + return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (__m128i x) +{ + return _mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; +} + +static force_inline int +is_transparent (__m128i x) +{ + return (_mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; +} + +static force_inline __m128i +expand_pixel_32_1x128 (uint32_t data) +{ + return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); +} + +static force_inline __m128i +expand_alpha_1x128 (__m128i data) +{ + return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, + _MM_SHUFFLE (3, 3, 3, 3)), + _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); + + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_rev_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline void +pix_multiply_2x128 (__m128i* data_lo, + __m128i* data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i lo, hi; + + lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); + hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); + lo = _mm_adds_epu16 (lo, mask_0080); + hi = _mm_adds_epu16 (hi, mask_0080); + *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); + *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); +} + +static force_inline void +pix_add_multiply_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_dst_lo, + __m128i* alpha_dst_hi, + __m128i* dst_lo, + __m128i* dst_hi, + __m128i* alpha_src_lo, + __m128i* alpha_src_hi, + __m128i* ret_lo, + __m128i* ret_hi) +{ + __m128i t1_lo, t1_hi; + __m128i t2_lo, t2_hi; + + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); + + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); +} + +static force_inline void +negate_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* neg_lo, + __m128i* neg_hi) 
+{ + *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); + *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); +} + +static force_inline void +invert_colors_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* inv_lo, + __m128i* inv_hi) +{ + __m128i lo, hi; + + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline void +over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i t1, t2; + + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); + + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); + + *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); + *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); +} + +static force_inline void +over_rev_non_pre_2x128 (__m128i src_lo, + __m128i src_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i lo, hi; + __m128i alpha_lo, alpha_hi; + + expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); + + lo = _mm_or_si128 (alpha_lo, mask_alpha); + hi = _mm_or_si128 (alpha_hi, mask_alpha); + + invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); + + pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); + + over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); +} + +static force_inline void +in_over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* mask_lo, + __m128i* mask_hi, + __m128i* dst_lo, + __m128i* dst_hi) +{ + __m128i s_lo, s_hi; + __m128i a_lo, a_hi; + + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); + + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); +} + +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline __m128i +load_128_aligned (__m128i* src) +{ + return _mm_load_si128 (src); +} + +/* load 4 pixels from an unaligned address */ +static force_inline __m128i +load_128_unaligned (const __m128i* src) +{ + return _mm_loadu_si128 (src); +} + +/* save 4 pixels using Write Combining memory on a 16-byte + * boundary aligned address + */ +static force_inline void +save_128_write_combining (__m128i* dst, + __m128i data) +{ + _mm_stream_si128 (dst, data); +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (__m128i* dst, + __m128i data) +{ + _mm_store_si128 (dst, data); +} + +/* save 4 pixels on an unaligned address */ +static force_inline void +save_128_unaligned (__m128i* dst, + __m128i data) +{ + _mm_storeu_si128 (dst, data); +} + +static force_inline __m128i +load_32_1x128 (uint32_t data) +{ + return _mm_cvtsi32_si128 (data); +} + +static force_inline __m128i +expand_alpha_rev_1x128 (__m128i data) +{ + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +expand_pixel_8_1x128 (uint8_t data) +{ + return _mm_shufflelo_epi16 ( + unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +pix_multiply_1x128 (__m128i data, + __m128i alpha) +{ + return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), + mask_0080), + mask_0101); +} + +static force_inline __m128i +pix_add_multiply_1x128 (__m128i* src, + __m128i* alpha_dst, + __m128i* dst, + __m128i* alpha_src) +{ + __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); + __m128i t2 =
pix_multiply_1x128 (*dst, *alpha_src); + + return _mm_adds_epu8 (t1, t2); +} + +static force_inline __m128i +negate_1x128 (__m128i data) +{ + return _mm_xor_si128 (data, mask_00ff); +} + +static force_inline __m128i +invert_colors_1x128 (__m128i data) +{ + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline __m128i +over_1x128 (__m128i src, __m128i alpha, __m128i dst) +{ + return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); +} + +static force_inline __m128i +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) +{ + return over_1x128 (pix_multiply_1x128 (*src, *mask), + pix_multiply_1x128 (*alpha, *mask), + *dst); +} + +static force_inline __m128i +over_rev_non_pre_1x128 (__m128i src, __m128i dst) +{ + __m128i alpha = expand_alpha_1x128 (src); + + return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), + _mm_or_si128 (alpha, mask_alpha)), + alpha, + dst); +} + +static force_inline uint32_t +pack_1x128_32 (__m128i data) +{ + return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); +} + +static force_inline __m128i +expand565_16_1x128 (uint16_t pixel) +{ + __m128i m = _mm_cvtsi32_si128 (pixel); + + m = unpack_565_to_8888 (m); + + return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); +} + +static force_inline uint32_t +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ + uint8_t a; + __m128i xmms; + + a = src >> 24; + + if (a == 0xff) + { + return src; + } + else if (src) + { + xmms = unpack_32_1x128 (src); + return pack_1x128_32 ( + over_1x128 (xmms, expand_alpha_1x128 (xmms), + unpack_32_1x128 (dst))); + } + + return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ + uint32_t s = *ps; + + if (pm) + { + __m128i ms, mm; + + mm = unpack_32_1x128 (*pm); + mm = expand_alpha_1x128 (mm); + + ms = unpack_32_1x128 (s); + ms = pix_multiply_1x128 (ms, mm); + + s = pack_1x128_32 (ms); + } + + return s; +} + +static force_inline __m128i +combine4 (const __m128i *ps, const __m128i *pm) +{ + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_msk_lo, xmm_msk_hi; + __m128i s; + + if (pm) + { + xmm_msk_lo = load_128_unaligned (pm); + + if (is_transparent (xmm_msk_lo)) + return _mm_setzero_si128 (); + } + + s = load_128_unaligned (ps); + + if (pm) + { + unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); + + expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_msk_lo, &xmm_msk_hi, + &xmm_src_lo, &xmm_src_hi); + + s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); + } + + return s; +} + +static force_inline void +core_combine_over_u_sse2_mask (uint32_t * pd, + const uint32_t* ps, + const uint32_t* pm, + int w) +{ + uint32_t s, d; + + /* Align dst on a 16-byte boundary */ + while (w && ((uintptr_t)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + pm++; + w--; + } + + while (w >= 4) + { + __m128i mask = load_128_unaligned ((__m128i *)pm); + + if (!is_zero (mask)) + { + __m128i src; + __m128i src_hi, src_lo; + __m128i mask_hi, mask_lo; + __m128i alpha_hi, alpha_lo; + + src = load_128_unaligned ((__m128i *)ps); + + if (is_opaque (_mm_and_si128 (src, mask))) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + __m128i dst_hi, dst_lo; + + unpack_128_2x128 (mask, &mask_lo, &mask_hi); + unpack_128_2x128 (src, 
&src_lo, &src_hi); + + expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); + pix_multiply_2x128 (&src_lo, &src_hi, + &mask_lo, &mask_hi, + &src_lo, &src_hi); + + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } + } + + pm += 4; + ps += 4; + pd += 4; + w -= 4; + } + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + pm++; + + w--; + } +} + +static force_inline void +core_combine_over_u_sse2_no_mask (uint32_t * pd, + const uint32_t* ps, + int w) +{ + uint32_t s, d; + + /* Align dst on a 16-byte boundary */ + while (w && ((uintptr_t)pd & 15)) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i src; + __m128i src_hi, src_lo, dst_hi, dst_lo; + __m128i alpha_hi, alpha_lo; + + src = load_128_unaligned ((__m128i *)ps); + + if (!is_zero (src)) + { + if (is_opaque (src)) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + + unpack_128_2x128 (src, &src_lo, &src_hi); + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } + } + + ps += 4; + pd += 4; + w -= 4; + } + while (w) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + + w--; + } +} + +static force_inline void +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + if (pm) + core_combine_over_u_sse2_mask (pd, ps, pm, w); + else + core_combine_over_u_sse2_no_mask (pd, ps, w); +} + +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + /* Align dst on a 16-byte boundary */ + while (w && + ((uintptr_t)pd & 15)) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. 
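Every *_over_u path above computes the premultiplied OVER operator, dest = src + dest·(255 − src_alpha)/255 per channel; the mask_0080/mask_0101 pair in pix_multiply_2x128 is the SIMD form of pixman's rounded division by 255, and _mm_adds_epu8 supplies the saturating add. A scalar reference of the same arithmetic, as a sketch with illustrative names:

#include <stdint.h>
#include <stdio.h>

/* rounded x*a/255, same result as (t + 0x80) * 0x0101 >> 16 */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a * b + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

static uint32_t
over_scalar (uint32_t src, uint32_t dst)
{
    uint8_t  ia = 255 - (src >> 24);   /* inverse source alpha */
    uint32_t r  = 0;
    int      shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s = src >> shift, d = dst >> shift;
        uint32_t c = s + mul_un8 (d, ia);

        if (c > 255)                   /* saturate, like _mm_adds_epu8 */
            c = 255;
        r |= c << shift;
    }
    return r;
}

int
main (void)
{
    /* half-transparent grey over opaque white: 0xffbfbfbf */
    printf ("%08x\n", over_scalar (0x80404040, 0xffffffff));
    return 0;
}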
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_src_lo, &xmm_src_hi); + + /* rebuild the 4 pixel data and save */ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_src_lo, xmm_src_hi)); + + w -= 4; + ps += 4; + pd += 4; + + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps, pm); + + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + ps++; + w--; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ + uint32_t maska = src >> 24; + + if (maska == 0) + { + return 0; + } + else if (maska != 0xff) + { + return pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (dst), + expand_alpha_1x128 (unpack_32_1x128 (src)))); + } + + return dst; +} + +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((uintptr_t)pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (d, s); + w--; + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && ((uintptr_t)pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_in_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + while (w && ((uintptr_t)pd & 15)) + { + uint32_t s = combine1 (ps,
pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); + + if (pm) + pm++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + if (pm) + pm += 4; + + w -= 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); + ps++; + if (pm) + pm++; + w--; + } +} + +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + while (w && ((uintptr_t)pd & 15)) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); + __m128i da = expand_alpha_1x128 (d); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((uintptr_t)pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, 
&xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i sa = expand_alpha_1x128 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((uintptr_t)pd & 15)) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); + ps++; + w--; + if (pm) + pm++; + } +} + +static force_inline uint32_t +core_combine_xor_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); + __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); +} + +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + + while (w && ((uintptr_t)pd & 15)) + 
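/* Scalar equivalent of what this loop and the 4-pixel one below compute + * per channel (Porter-Duff XOR): xor (s, d) = s * ~alpha (d) + d * ~alpha (s), + * i.e. pix_add_multiply with both expanded alphas negated, as done in + * core_combine_xor_u_pixel_sse2 above. */ +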
{ + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); + xmm_dst = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; + if (pm) + pm += 4; + } + + while (w) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } +} + +static force_inline void +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + int w = width; + uint32_t s, d; + uint32_t* pd = dst; + const uint32_t* ps = src; + const uint32_t* pm = mask; + + while (w && (uintptr_t)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + if (pm) + pm++; + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); + w--; + } + + while (w >= 4) + { + __m128i s; + + s = combine4 ((__m128i*)ps, (__m128i*)pm); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + w -= 4; + } + + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + ps++; + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); + if (pm) + pm++; + } +} + +/* SATURATE: when the source alpha is greater than the headroom left by the + * destination alpha (~dst >> 24), scale the source by DIV_UN8 (da, sa) first, + * so the saturating add below cannot overflow the destination. */ +static force_inline uint32_t +core_combine_saturate_u_pixel_sse2 (uint32_t src, + uint32_t dst) +{ + __m128i ms = unpack_32_1x128 (src); + __m128i md = unpack_32_1x128 (dst); + uint32_t sa = src >> 24; + uint32_t da = ~dst >> 24; + + if (sa > da) + { + ms = pix_multiply_1x128 ( + ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); + } + + return pack_1x128_32 (_mm_adds_epu16 (md, ms)); +} + +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, d; + + uint32_t pack_cmp; + __m128i xmm_src, xmm_dst; + + while (w && (uintptr_t)pd & 15) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + w--; + ps++; + if (pm) + pm++; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpgt_epi32 ( + _mm_srli_epi32 (xmm_src, 24), + _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); + + /* if some src alpha is greater than the respective ~dst alpha */ + if (pack_cmp) + { + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + 
*pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + if (pm) + pm++; + } + else + { + save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); + + pd += 4; + ps += 4; + if (pm) + pm += 4; + } + + w -= 4; + } + + while (w--) + { + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + ps++; + if (pm) + pm++; + } +} + +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); + w--; + } +} + +static force_inline uint32_t +core_combine_over_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i s = unpack_32_1x128 (src); + __m128i expAlpha = expand_alpha_1x128 (s); + __m128i unpk_mask = unpack_32_1x128 (mask); + __m128i unpk_dst = unpack_32_1x128 (dst); + + return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); +} + +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i d = unpack_32_1x128 (dst); + + return pack_1x128_32 ( + over_1x128 (d, expand_alpha_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (src), + unpack_32_1x128 (mask)))); +} + +static void 
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); + + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); + + w--; + } +} + +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); + 
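/* The store above is the scalar form of component-alpha IN_REVERSE: + * per channel, dst = d * (m * alpha (s)); the 4-pixel loop below computes + * the same product two pixels per 128-bit register. */ +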
w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); + w--; + } +} + +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); + + w--; + } +} + +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, 
&xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); + w--; + } +} + +static force_inline uint32_t +core_combine_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = expand_alpha_1x128 (d); + + s = pix_multiply_1x128 (s, m); + m = negate_1x128 (pix_multiply_1x128 (m, sa)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + __m128i sa = expand_alpha_1x128 (s); + + s = pix_multiply_1x128 (s, m); + m = pix_multiply_1x128 (m, 
sa); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static force_inline uint32_t +core_combine_xor_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) +{ + __m128i a = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + + __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( + a, expand_alpha_1x128 (s))); + __m128i dest = pix_multiply_1x128 (s, a); + __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); + + return pack_1x128_32 (pix_add_multiply_1x128 (&d, + &alpha_dst, + &dest, + &alpha_src)); +} + +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } + + while (w >= 4) + { + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, 
&xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; + } +} + +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + uint32_t s, m, d; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + + while (w && (uintptr_t)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 ( + _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), + _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } +} + +static force_inline __m128i +create_mask_16_128 (uint16_t mask) +{ + return _mm_set1_epi16 (mask); +} + +/* Work around a code generation bug in Sun Studio 12. 
*/ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1) \ + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else +static force_inline __m128i +create_mask_2x32_128 (uint32_t mask0, + uint32_t mask1) +{ + return _mm_set_epi32 (mask0, mask1, mask0, mask1); +} +#endif + +static void +sse2_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + d = *dst; + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + while (w >= 4) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuild the 4 pixel data and save */ + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + d = *dst; + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + } +} + +static void +sse2_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint16_t *dst_line, *dst, d; + int32_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + d = *dst; + + *dst++ = pack_565_32_16 ( + pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + expand565_16_1x128 (d)))); + w--; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst0, &xmm_dst1); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst2, &xmm_dst3); + + xmm_dst = pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + save_128_aligned ((__m128i*)dst, xmm_dst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack_565_32_16 ( + pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, + expand565_16_1x128 (d)))); + } + } + +} + +static void +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src; + __m128i xmm_dst; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, 
src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + mmx_src = xmm_src; + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (uintptr_t)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); + } + + pd++; + w--; + } + } + +} + +static void +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (uintptr_t)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + 
&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + } + + pd++; + w--; + } + } + +} + +static void +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int32_t w; + int dst_stride, src_stride; + + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + } + +} + +static void +sse2_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + s = *src++; + *dst = convert_8888_to_0565 (s); + dst++; + w--; + } + + while (w >= 8) + { + __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); + __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); + + save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + *dst = 
convert_8888_to_0565 (s); + dst++; + w--; + } + } +} + +static void +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + *dst++ = *src++ | 0xff000000; + w--; + } + + while (w >= 16) + { + __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; + + xmm_src1 = load_128_unaligned ((__m128i*)src + 0); + xmm_src2 = load_128_unaligned ((__m128i*)src + 1); + xmm_src3 = load_128_unaligned ((__m128i*)src + 2); + xmm_src4 = load_128_unaligned ((__m128i*)src + 3); + + save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *src++ | 0xff000000; + w--; + } + } + +} + +static void +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_mask, xmm_alpha; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + + xmm_mask = create_mask_16_128 (mask >> 24); + xmm_alpha = mask_00ff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); + + w--; + } + + while (w >= 4) + { + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha, &xmm_alpha, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); + + w--; + } + } + +} + +static void +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + int dst_stride, 
src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; + + while (height--) + { + sse2_combine_over_u (imp, op, dst, src, NULL, width); + + dst += dst_stride; + src += src_stride; + } +} + +static force_inline uint16_t +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) +{ + __m128i ms; + + ms = unpack_32_1x128 (src); + return pack_565_32_16 ( + pack_1x128_32 ( + over_1x128 ( + ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); +} + +static void +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((uintptr_t)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + w--; + } + + /* It's an 8 pixel loop */ + while (w >= 8) + { + /* I'm loading unaligned because I'm not sure + * about the address alignment. + */ + xmm_src = load_128_unaligned ((__m128i*) src); + xmm_dst = load_128_aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + /* I'm loading the next 4 pixels from memory + * ahead of time, to optimize the memory read. 
+ */ + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst0, &xmm_dst1); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst2, &xmm_dst3); + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + } + } + +} + +static void +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m, d; + + __m128i xmm_src, xmm_alpha, xmm_def; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + + while (w >= 4) + { + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + } + +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t filler) +{ + uint32_t byte_width; + uint8_t *byte_line; + + __m128i xmm_def; + + if (bpp == 8) + { + uint8_t b; + uint16_t w; + + stride = stride * (int) sizeof (uint32_t) / 1; 
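+ /* pixman strides are in uint32_t units; the line above rescales it to + * byte (8 bpp pixel) units so that the row address computed below lands + * on the right row. The 16 and 32 bpp branches do the same with / 2 + * and / 4. */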
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width; + stride *= 1; + + b = filler & 0xff; + w = (b << 8) | b; + filler = (w << 16) | w; + } + else if (bpp == 16) + { + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; + + filler = (filler & 0xffff) * 0x00010001; + } + else if (bpp == 32) + { + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; + } + else + { + return FALSE; + } + + xmm_def = create_mask_2x32_128 (filler, filler); + + while (height--) + { + int w; + uint8_t *d = byte_line; + byte_line += stride; + w = byte_width; + + if (w >= 1 && ((uintptr_t)d & 1)) + { + *(uint8_t *)d = filler; + w -= 1; + d += 1; + } + + while (w >= 2 && ((uintptr_t)d & 3)) + { + *(uint16_t *)d = filler; + w -= 2; + d += 2; + } + + while (w >= 4 && ((uintptr_t)d & 15)) + { + *(uint32_t *)d = filler; + + w -= 4; + d += 4; + } + + while (w >= 128) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + save_128_aligned ((__m128i*)(d + 64), xmm_def); + save_128_aligned ((__m128i*)(d + 80), xmm_def); + save_128_aligned ((__m128i*)(d + 96), xmm_def); + save_128_aligned ((__m128i*)(d + 112), xmm_def); + + d += 128; + w -= 128; + } + + if (w >= 64) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + + d += 64; + w -= 64; + } + + if (w >= 32) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + + d += 32; + w -= 32; + } + + if (w >= 16) + { + save_128_aligned ((__m128i*)(d), xmm_def); + + d += 16; + w -= 16; + } + + while (w >= 4) + { + *(uint32_t *)d = filler; + + w -= 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = filler; + w -= 2; + d += 2; + } + + if (w >= 1) + { + *(uint8_t *)d = filler; + w -= 1; + d += 1; + } + } + + return TRUE; +} + +static void +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + + __m128i xmm_src, xmm_def; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + srca = src >> 24; + if (src == 0) + { + sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), + dest_x, dest_y, width, height, 0); + return; + } + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x128_32 ( + pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + + while (w >= 4) + { + m = *((uint32_t*)mask); + + if 
(srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + } + else + { + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_src, expand_pixel_8_1x128 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + } + +} + +static void +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint16_t *dst_line, *dst, d; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t m; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); + + *dst = pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + + while (w >= 8) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); + + *dst 
= pack_565_32_16 ( + pack_1x128_32 ( + in_over_1x128 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + } + +} + +static void +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m128i ms; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x128 (s); + + *dst++ = pack_565_32_16 ( + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); + w--; + } + + while (w >= 8) + { + /* First round */ + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + /* preload next round*/ + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x128 (s); + + *dst++ = pack_565_32_16 ( + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); + w--; + } + } + +} + +static void +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + uint32_t opaque, zero; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); + + w--; + } + + while (w >= 4) + { + xmm_src_hi = load_128_unaligned ((__m128i*)src); + + opaque = is_opaque (xmm_src_hi); + zero = is_zero (xmm_src_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 
(xmm_dst_lo, xmm_dst_hi));
+            }
+            else if (!zero)
+            {
+                xmm_dst_hi = load_128_aligned ((__m128i*)dst);
+
+                unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                        &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned (
+                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            w -= 4;
+            dst += 4;
+            src += 4;
+        }
+
+        while (w)
+        {
+            s = *src++;
+            d = *dst;
+
+            *dst++ = pack_1x128_32 (
+                over_rev_non_pre_1x128 (
+                    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+            w--;
+        }
+    }
+
+}
+
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t *dst_line, *dst, d;
+    uint32_t *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int w;
+    uint32_t pack_cmp;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+        w = width;
+        mask = mask_line;
+        dst = dst_line;
+        mask_line += mask_stride;
+        dst_line += dst_stride;
+
+        while (w && ((uintptr_t)dst & 15))
+        {
+            m = *(uint32_t *) mask;
+
+            if (m)
+            {
+                d = *dst;
+                mmx_mask = unpack_32_1x128 (m);
+                mmx_dest = expand565_16_1x128 (d);
+
+                *dst = pack_565_32_16 (
+                    pack_1x128_32 (
+                        in_over_1x128 (
+                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+            }
+
+            w--;
+            dst++;
+            mask++;
+        }
+
+        while (w >= 8)
+        {
+            /* First round */
+            xmm_mask = load_128_unaligned ((__m128i*)mask);
+            xmm_dst = load_128_aligned ((__m128i*)dst);
+
+            pack_cmp = _mm_movemask_epi8 (
+                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+            unpack_565_128_4x128 (xmm_dst,
+                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+            /* preload next round */
+            xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+            if (pack_cmp != 0xffff)
+            {
+                in_over_2x128 (&xmm_src, &xmm_src,
+                               &xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_dst0, &xmm_dst1);
+            }
+
+            /* Second round */
+            pack_cmp = _mm_movemask_epi8 (
+                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+            if (pack_cmp != 0xffff)
+            {
+                in_over_2x128 (&xmm_src, &xmm_src,
+                               &xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_dst2, &xmm_dst3);
+            }
+
+            save_128_aligned (
+                (__m128i*)dst, pack_565_4x128_128 (
+                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+            w -= 8;
+            dst += 8;
+            mask += 8;
+        }
+
+        while (w)
+        {
+            m = *(uint32_t *) mask;
+
+            if (m)
+            {
+                d = *dst;
+                mmx_mask = unpack_32_1x128 (m);
+                mmx_dest = expand565_16_1x128 (d);
+
+                *dst = pack_565_32_16 (
+                    pack_1x128_32 (
+                        in_over_1x128 (
+                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+            }
+
+            w--;
+            dst++;
+            mask++;
+        }
+    }
+
+}
+
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint32_t d, m;
+    uint32_t
src; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (xmm_alpha, + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + } + +} + +static void +sse2_composite_in_n_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + int dst_stride; + uint32_t d; + uint32_t src; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + src = src >> 24; + + if (src == 0xff) + return; + + if (src == 0x00) + { + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, + 8, dest_x, dest_y, width, height, src); + + return; + } + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 16; + w -= 16; + } + + while (w) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + } + +} + +static void +sse2_composite_in_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + int32_t w; + uint32_t s, d; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, 
dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); + w--; + } + } + +} + +static void +sse2_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint32_t m, d; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); + + w--; + } + } + +} + +static void +sse2_composite_add_n_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + int dst_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + src >>= 24; + + if (src == 0x00) + return; + + if (src == 0xff) + { + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, + 
8, dest_x, dest_y, width, height, 0xff); + + return; + } + + src = (src << 24) | (src << 16) | (src << 8) | src; + xmm_src = _mm_set_epi32 (src, src, src, src); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + + w--; + dst++; + } + + while (w >= 16) + { + save_128_aligned ( + (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + + dst += 16; + w -= 16; + } + + while (w) + { + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + + w--; + dst++; + } + } + +} + +static void +sse2_composite_add_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head */ + while (w && (uintptr_t)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + sse2_combine_add_u (imp, op, + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + } + +} + +static void +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + sse2_combine_add_u (imp, op, dst, src, NULL, width); + } +} + +static void +sse2_composite_add_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst, src; + int dst_stride; + + __m128i xmm_src; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + + if (src == ~0) + { + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, + dest_x, dest_y, width, height, ~0); + + return; + } + + xmm_src = _mm_set_epi32 (src, src, src, src); + while (height--) + { + int w = width; + uint32_t d; + + dst = dst_line; + dst_line += dst_stride; + + while (w && (uintptr_t)dst & 15) + { + d = *dst; + *dst++ = + _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); + w--; + } + + while (w >= 4) + { + save_128_aligned + ((__m128i*)dst, + _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + + dst += 4; + w -= 4; + } + + while (w--) + { + d = *dst; + *dst++ = + _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, + _mm_cvtsi32_si128 (d))); + } + } +} + +static void +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; 
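+    /* The solid source is expanded to 8x16-bit channels just once, ahead of
+     * the loops; each destination pixel then needs only an a8 mask
+     * expansion, a multiply and a saturating add. */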
+ int dst_stride, mask_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + xmm_src = expand_pixel_32_1x128 (src); + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + uint8_t m = *mask++; + if (m) + { + *dst = pack_1x128_32 + (_mm_adds_epu16 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), + unpack_32_1x128 (*dst))); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t m = *(uint32_t*)mask; + if (m) + { + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + __m128i xmm_dst = load_128_aligned ((__m128i*)dst); + __m128i xmm_mask = + _mm_unpacklo_epi8 (unpack_32_1x128(m), + _mm_setzero_si128 ()); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + if (m) + { + *dst = pack_1x128_32 + (_mm_adds_epu16 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), + unpack_32_1x128 (*dst))); + } + dst++; + w--; + } + } +} + +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((uintptr_t)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((uintptr_t)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + while (w >= 64) + { + __m128i xmm0, xmm1, xmm2, xmm3; + + xmm0 = load_128_unaligned ((__m128i*)(s)); + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); + xmm2 = load_128_unaligned 
((__m128i*)(s + 32)); + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + + save_128_aligned ((__m128i*)(d), xmm0); + save_128_aligned ((__m128i*)(d + 16), xmm1); + save_128_aligned ((__m128i*)(d + 32), xmm2); + save_128_aligned ((__m128i*)(d + 48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + while (w >= 16) + { + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); + + w -= 16; + d += 16; + s += 16; + } + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + + return TRUE; +} + +static void +sse2_composite_copy_area (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + sse2_blt (imp, src_image->bits.bits, + dest_image->bits.bits, + src_image->bits.rowstride, + dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dest_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); +} + +static void +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + __m128i ms; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (uintptr_t)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + ms = unpack_32_1x128 (s); + + if (m != 0xff) + { + __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + __m128i md = unpack_32_1x128 (d); + + ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); + } + + *dst++ = pack_1x128_32 (ms); + w--; + } + + while (w >= 4) + { + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 ( + xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + __m128i ma, md, ms; + + d = *dst; + + ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + md = unpack_32_1x128 (d); + ms = unpack_32_1x128 (s); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); + } + + } + + src++; + dst++; + 
w--; + } + } + +} + +static void +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + m = *(uint32_t *) mask; + + if (m) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + +} + +static void +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, *dst; + __m128i xmm_src; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_dsta_hi, xmm_dsta_lo; + int dst_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + xmm_src = expand_pixel_32_1x128 (src); + + while (height--) + { + dst = dst_line; + + dst_line += dst_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + __m128i 
vd; + + vd = unpack_32_1x128 (*dst); + + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } + + while (w >= 4) + { + __m128i tmp_lo, tmp_hi; + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); + + tmp_lo = xmm_src; + tmp_hi = xmm_src; + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_dsta_lo, &xmm_dsta_hi, + &tmp_lo, &tmp_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + __m128i vd; + + vd = unpack_32_1x128 (*dst); + + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } + + } + +} + +static void +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint32_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (uintptr_t)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + + if (!is_transparent (xmm_mask)) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (is_opaque (xmm_mask) && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + 
}
+
+            dst++;
+            w--;
+        }
+    }
+
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  src_width_fixed,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+        return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+        d = *pd;
+        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+        if (pm)
+            pm++;
+        w--;
+    }
+
+    while (w >= 4)
+    {
+        __m128i tmp;
+        uint32_t tmp1, tmp2, tmp3, tmp4;
+
+        tmp1 = *(ps + pixman_fixed_to_int (vx));
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+        tmp2 = *(ps + pixman_fixed_to_int (vx));
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+        tmp3 = *(ps + pixman_fixed_to_int (vx));
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+        tmp4 = *(ps + pixman_fixed_to_int (vx));
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+        if (is_opaque (xmm_src_hi))
+        {
+            save_128_aligned ((__m128i*)pd, xmm_src_hi);
+        }
+        else if (!is_zero (xmm_src_hi))
+        {
+            xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+            expand_alpha_2x128 (
+                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+            over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                        &xmm_alpha_lo, &xmm_alpha_hi,
+                        &xmm_dst_lo, &xmm_dst_hi);
+
+            /* rebuild the 4 pixel data and save */
+            save_128_aligned ((__m128i*)pd,
+                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+        }
+
+        w -= 4;
+        pd += 4;
+        if (pm)
+            pm += 4;
+    }
+
+    while (w)
+    {
+        d = *pd;
+        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+        if (pm)
+            pm++;
+
+        w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+                       scaled_nearest_scanline_sse2_8888_8888_OVER,
+                       uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+                       scaled_nearest_scanline_sse2_8888_8888_OVER,
+                       uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+                       scaled_nearest_scanline_sse2_8888_8888_OVER,
+                       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+                       scaled_nearest_scanline_sse2_8888_8888_OVER,
+                       uint32_t, uint32_t, NORMAL)
+
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+                                               uint32_t *       dst,
+                                               const uint32_t * src,
+                                               int32_t          w,
+                                               pixman_fixed_t   vx,
+                                               pixman_fixed_t   unit_x,
+                                               pixman_fixed_t   src_width_fixed,
+                                               pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+        return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && (uintptr_t)dst & 15)
+    {
+        uint32_t s = *(src + pixman_fixed_to_int (vx));
+        vx += unit_x;
+        while (vx >= 0)
+            vx -= src_width_fixed;
+
+        if (s)
+        {
+            uint32_t d = *dst;
+
+            __m128i ms = unpack_32_1x128 (s);
+            __m128i alpha =
expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = *(src + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(src + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(src + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(src + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *(src + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) + +#if BILINEAR_INTERPOLATION_BITS < 8 +# define BILINEAR_DECLARE_VARIABLES \ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ + const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ + const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ + unit_x, -unit_x, unit_x, -unit_x); \ + const __m128i xmm_zero = _mm_setzero_si128 (); \ + __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \ + vx, -(vx + 1), vx, -(vx + 1)) +#else +# define BILINEAR_DECLARE_VARIABLES \ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ + const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ + const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ + -unit_x, -unit_x, -unit_x, -unit_x); \ + const __m128i xmm_zero = _mm_setzero_si128 (); \ + __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, \ + -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1)) +#endif + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +do { \ + __m128i xmm_wh, xmm_lo, xmm_hi, a; \ + /* fetch 2x2 pixel block into sse2 registers */ \ + __m128i tltr = 
_mm_loadl_epi64 ( \ + (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ + __m128i blbr = _mm_loadl_epi64 ( \ + (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ + vx += unit_x; \ + /* vertical interpolation */ \ + a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ + xmm_wt), \ + _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ + xmm_wb)); \ + if (BILINEAR_INTERPOLATION_BITS < 8) \ + { \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ + a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ + } \ + else \ + { \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ + 16 - BILINEAR_INTERPOLATION_BITS)); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + } \ + /* shift and pack the result */ \ + a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ + a = _mm_packs_epi32 (a, a); \ + a = _mm_packus_epi16 (a, a); \ + pix = _mm_cvtsi128_si32 (a); \ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL() \ +do { \ + vx += unit_x; \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ +} while(0) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + + while ((w -= 4) >= 0) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + *dst++ = pix1; + *dst++ = pix2; + *dst++ = pix3; + *dst++ = pix4; + } + + if (w & 2) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + *dst++ = pix1; + *dst++ = pix2; + } + + if (w & 1) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + *dst = pix1; + } + +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + + while (w && ((uintptr_t)dst & 15)) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (pix1) + { + pix2 = *dst; + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); + } + + w--; 
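+        /* This head loop runs until dst reaches a 16-byte boundary; the main
+         * loop below then interpolates four pixels per iteration and uses
+         * aligned stores, copying straight through when all four pixels are
+         * opaque and skipping the store when all four are zero. */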
+ dst++; + } + + while (w >= 4) + { + __m128i xmm_src; + __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; + __m128i xmm_alpha_hi, xmm_alpha_lo; + + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + + if (!is_zero (xmm_src)) + { + if (is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + __m128i xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + w -= 4; + dst += 4; + } + + while (w) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (pix1) + { + pix2 = *dst; + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); + } + + w--; + dst++; + } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, + const uint8_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + uint32_t m; + + while (w && ((uintptr_t)dst & 15)) + { + uint32_t sa; + + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + sa = pix1 >> 24; + + if (sa == 0xff && m == 0xff) + { + *dst = pix1; + } + else + { + __m128i ms, md, ma, msa; + + pix2 = *dst; + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (pix1); + md = unpack_32_1x128 (pix2); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } + + while (w >= 4) + { + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + m = *(uint32_t*)mask; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 
(xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint32_t sa; + + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + sa = pix1 >> 24; + + if (sa == 0xff && m == 0xff) + { + *dst = pix1; + } + else + { + __m128i ms, md, ma, msa; + + pix2 = *dst; + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (pix1); + md = unpack_32_1x128 (pix2); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NORMAL, FLAG_HAVE_NON_SOLID_MASK) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + __m128i xmm_mask; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && ((uintptr_t)dst & 15)) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + if (pix1) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (pix1); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 + (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + + dst++; + w--; + } + + while (w >= 4) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + + if (pix1 | pix2 | pix3 | pix4) + { + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned + ((__m128i*)dst, pack_2x128_128 
(xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + if (pix1) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (pix1); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 + (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + + dst++; + w--; + } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_HAVE_SOLID_MASK) + +static const pixman_fast_path_t sse2_fast_paths[] = +{ + /* PIXMAN_OP_OVER */ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, 
solid, a8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + + /* PIXMAN_OP_OVER_REVERSE */ + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), + + /* PIXMAN_OP_ADD */ + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), + + /* PIXMAN_OP_SRC */ + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, 
r5g6b5, sse2_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), + + /* PIXMAN_OP_IN */ + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), + + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + 
SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), + + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), + + { PIXMAN_OP_NONE }, +}; + +static uint32_t * +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + __m128i ff000000 = mask_ff000000; + uint32_t *dst = iter->buffer; + uint32_t *src = (uint32_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((uintptr_t)dst) & 0x0f) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + while (w >= 4) + { + save_128_aligned ( + (__m128i *)dst, _mm_or_si128 ( + load_128_unaligned ((__m128i *)src), ff000000)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint16_t *src = (uint16_t *)iter->bits; + __m128i ff000000 = mask_ff000000; + + iter->bits += iter->stride; + + while (w && ((uintptr_t)dst) & 0x0f) + { + uint16_t s = *src++; + + *dst++ = convert_0565_to_8888 (s); + w--; + } + + while (w >= 8) + { + __m128i lo, hi, s; + + s = _mm_loadu_si128 ((__m128i *)src); + + lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); + hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); + + save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); + save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); + + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + uint16_t s = *src++; + + *dst++ = convert_0565_to_8888 (s); + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = iter->bits; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + iter->bits += iter->stride; + + while (w && (((uintptr_t)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 16) + { + xmm0 = _mm_loadu_si128((__m128i *)src); + + xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); + xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); + xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); + xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); + xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); + xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); + + _mm_store_si128(((__m128i *)(dst + 0)), xmm3); + _mm_store_si128(((__m128i *)(dst + 4)), xmm4); + _mm_store_si128(((__m128i *)(dst + 8)), xmm5); + 
_mm_store_si128(((__m128i *)(dst + 12)), xmm6); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + return iter->buffer; +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, + { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, + { PIXMAN_a8, sse2_fetch_a8 }, + { PIXMAN_null } +}; + +static pixman_bool_t +sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + + if ((iter->iter_flags & ITER_NARROW) && + (iter->image_flags & FLAGS) == FLAGS) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return TRUE; + } + } + } + + return FALSE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); + + /* SSE2 constants */ + mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8); + mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004); + + /* Set up function pointers */ + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + 
imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + + imp->blt = sse2_blt; + imp->fill = sse2_fill; + + imp->src_iter_init = sse2_src_iter_init; + + return imp; +} diff --git a/programs/develop/libraries/pixman/pixman-trap.c b/programs/develop/libraries/pixman/pixman-trap.c index 8353992c58..91766fdbfc 100644 --- a/programs/develop/libraries/pixman/pixman-trap.c +++ b/programs/develop/libraries/pixman/pixman-trap.c @@ -1,4 +1,5 @@ /* + * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc. * Copyright © 2004 Keith Packard * * Permission to use, copy, modify, distribute, and sell this software and its @@ -25,6 +26,7 @@ #endif #include +#include #include "pixman-private.h" /* @@ -137,7 +139,7 @@ _pixman_edge_multi_init (pixman_edge_t * e, if (ne > 0) { int nx = ne / e->dy; - ne -= nx * e->dy; + ne -= nx * (pixman_fixed_48_16_t)e->dy; stepx += nx * e->signdx; } @@ -228,14 +230,13 @@ pixman_line_fixed_edge_init (pixman_edge_t * e, } PIXMAN_EXPORT void -pixman_add_traps (pixman_image_t * image, - int16_t x_off, - int16_t y_off, - int ntrap, - pixman_trap_t * traps) +pixman_add_traps (pixman_image_t * image, + int16_t x_off, + int16_t y_off, + int ntrap, + const pixman_trap_t *traps) { int bpp; - int width; int height; pixman_fixed_t x_off_fixed; @@ -245,7 +246,6 @@ pixman_add_traps (pixman_image_t * image, _pixman_image_validate (image); - width = image->bits.width; height = image->bits.height; bpp = PIXMAN_FORMAT_BPP (image->bits.format); @@ -349,10 +349,8 @@ pixman_rasterize_trapezoid (pixman_image_t * image, int y_off) { int bpp; - int width; int height; - pixman_fixed_t x_off_fixed; pixman_fixed_t y_off_fixed; pixman_edge_t l, r; pixman_fixed_t t, b; @@ -364,11 +362,9 @@ pixman_rasterize_trapezoid (pixman_image_t * image, if (!pixman_trapezoid_valid (trap)) return; - width = image->bits.width; height = image->bits.height; bpp = PIXMAN_FORMAT_BPP (image->bits.format); - x_off_fixed = pixman_int_to_fixed (x_off); y_off_fixed = pixman_int_to_fixed (y_off); t = trap->top + y_off_fixed; @@ -390,3 +386,326 @@ pixman_rasterize_trapezoid (pixman_image_t * image, pixman_rasterize_edges (image, &l, &r, t, b); } } + +static const pixman_bool_t zero_src_has_no_effect[PIXMAN_N_OPERATORS] = +{ + FALSE, /* Clear 0 0 */ + FALSE, /* Src 1 0 */ + TRUE, /* Dst 0 1 */ + TRUE, /* Over 1 1-Aa */ + TRUE, /* OverReverse 1-Ab 1 */ + FALSE, /* In Ab 0 */ + FALSE, /* InReverse 0 Aa */ + FALSE, /* Out 1-Ab 0 */ + TRUE, /* OutReverse 0 1-Aa */ + TRUE, /* Atop Ab 1-Aa */ + FALSE, /* AtopReverse 1-Ab Aa */ + TRUE, /* Xor 1-Ab 1-Aa */ + TRUE, /* Add 1 1 */ +}; + +static pixman_bool_t +get_trap_extents (pixman_op_t op, pixman_image_t *dest, + const pixman_trapezoid_t *traps, int n_traps, + pixman_box32_t *box) +{ + int i; + + /* When the operator is such that a zero source has an + * effect on the underlying image, we have to + * composite across the entire destination + */ + if (!zero_src_has_no_effect [op]) + { + box->x1 = 0; + box->y1 = 0; + box->x2 = dest->bits.width; + box->y2 = dest->bits.height; + return TRUE; + } + + box->x1 = INT32_MAX; + 
box->y1 = INT32_MAX; + box->x2 = INT32_MIN; + box->y2 = INT32_MIN; + + for (i = 0; i < n_traps; ++i) + { + const pixman_trapezoid_t *trap = &(traps[i]); + int y1, y2; + + if (!pixman_trapezoid_valid (trap)) + continue; + + y1 = pixman_fixed_to_int (trap->top); + if (y1 < box->y1) + box->y1 = y1; + + y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom)); + if (y2 > box->y2) + box->y2 = y2; + +#define EXTEND_MIN(x) \ + if (pixman_fixed_to_int ((x)) < box->x1) \ + box->x1 = pixman_fixed_to_int ((x)); +#define EXTEND_MAX(x) \ + if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box->x2) \ + box->x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x))); + +#define EXTEND(x) \ + EXTEND_MIN(x); \ + EXTEND_MAX(x); + + EXTEND(trap->left.p1.x); + EXTEND(trap->left.p2.x); + EXTEND(trap->right.p1.x); + EXTEND(trap->right.p2.x); + } + + if (box->x1 >= box->x2 || box->y1 >= box->y2) + return FALSE; + + return TRUE; +} + +/* + * pixman_composite_trapezoids() + * + * All the trapezoids are conceptually rendered to an infinitely big image. + * The (0, 0) coordinates of this image are then aligned with the (x, y) + * coordinates of the source image, and then both images are aligned with + * the (x, y) coordinates of the destination. Then these three images are + * composited across the entire destination. + */ +PIXMAN_EXPORT void +pixman_composite_trapezoids (pixman_op_t op, + pixman_image_t * src, + pixman_image_t * dst, + pixman_format_code_t mask_format, + int x_src, + int y_src, + int x_dst, + int y_dst, + int n_traps, + const pixman_trapezoid_t * traps) +{ + int i; + + return_if_fail (PIXMAN_FORMAT_TYPE (mask_format) == PIXMAN_TYPE_A); + + if (n_traps <= 0) + return; + + _pixman_image_validate (src); + _pixman_image_validate (dst); + + if (op == PIXMAN_OP_ADD && + (src->common.flags & FAST_PATH_IS_OPAQUE) && + (mask_format == dst->common.extended_format_code) && + !(dst->common.have_clip_region)) + { + for (i = 0; i < n_traps; ++i) + { + const pixman_trapezoid_t *trap = &(traps[i]); + + if (!pixman_trapezoid_valid (trap)) + continue; + + pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst); + } + } + else + { + pixman_image_t *tmp; + pixman_box32_t box; + int i; + + if (!get_trap_extents (op, dst, traps, n_traps, &box)) + return; + + if (!(tmp = pixman_image_create_bits ( + mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1))) + return; + + for (i = 0; i < n_traps; ++i) + { + const pixman_trapezoid_t *trap = &(traps[i]); + + if (!pixman_trapezoid_valid (trap)) + continue; + + pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1); + } + + pixman_image_composite (op, src, tmp, dst, + x_src + box.x1, y_src + box.y1, + 0, 0, + x_dst + box.x1, y_dst + box.y1, + box.x2 - box.x1, box.y2 - box.y1); + + pixman_image_unref (tmp); + } +} + +static int +greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b) +{ + if (a->y == b->y) + return a->x > b->x; + return a->y > b->y; +} + +/* + * Note that the definition of this function is a bit odd because + * of the X coordinate space (y increasing downwards). 
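+ *
+ * The return expression below is the z component of the cross product
+ * (a - ref) x (b - ref): it is negative for a clockwise turn in
+ * conventional y-up coordinates, which is the turn that appears
+ * counter-clockwise on a y-down screen.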
+ */ +static int +clockwise (const pixman_point_fixed_t *ref, + const pixman_point_fixed_t *a, + const pixman_point_fixed_t *b) +{ + pixman_point_fixed_t ad, bd; + + ad.x = a->x - ref->x; + ad.y = a->y - ref->y; + bd.x = b->x - ref->x; + bd.y = b->y - ref->y; + + return ((pixman_fixed_32_32_t) bd.y * ad.x - + (pixman_fixed_32_32_t) ad.y * bd.x) < 0; +} + +static void +triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps) +{ + const pixman_point_fixed_t *top, *left, *right, *tmp; + + top = &tri->p1; + left = &tri->p2; + right = &tri->p3; + + if (greater_y (top, left)) + { + tmp = left; + left = top; + top = tmp; + } + + if (greater_y (top, right)) + { + tmp = right; + right = top; + top = tmp; + } + + if (clockwise (top, right, left)) + { + tmp = right; + right = left; + left = tmp; + } + + /* + * Two cases: + * + * + + + * / \ / \ + * / \ / \ + * / + + \ + * / -- -- \ + * / -- -- \ + * / --- --- \ + * +-- --+ + */ + + traps->top = top->y; + traps->left.p1 = *top; + traps->left.p2 = *left; + traps->right.p1 = *top; + traps->right.p2 = *right; + + if (right->y < left->y) + traps->bottom = right->y; + else + traps->bottom = left->y; + + traps++; + + *traps = *(traps - 1); + + if (right->y < left->y) + { + traps->top = right->y; + traps->bottom = left->y; + traps->right.p1 = *right; + traps->right.p2 = *left; + } + else + { + traps->top = left->y; + traps->bottom = right->y; + traps->left.p1 = *left; + traps->left.p2 = *right; + } +} + +static pixman_trapezoid_t * +convert_triangles (int n_tris, const pixman_triangle_t *tris) +{ + pixman_trapezoid_t *traps; + int i; + + if (n_tris <= 0) + return NULL; + + traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t)); + if (!traps) + return NULL; + + for (i = 0; i < n_tris; ++i) + triangle_to_trapezoids (&(tris[i]), traps + 2 * i); + + return traps; +} + +PIXMAN_EXPORT void +pixman_composite_triangles (pixman_op_t op, + pixman_image_t * src, + pixman_image_t * dst, + pixman_format_code_t mask_format, + int x_src, + int y_src, + int x_dst, + int y_dst, + int n_tris, + const pixman_triangle_t * tris) +{ + pixman_trapezoid_t *traps; + + if ((traps = convert_triangles (n_tris, tris))) + { + pixman_composite_trapezoids (op, src, dst, mask_format, + x_src, y_src, x_dst, y_dst, + n_tris * 2, traps); + + free (traps); + } +} + +PIXMAN_EXPORT void +pixman_add_triangles (pixman_image_t *image, + int32_t x_off, + int32_t y_off, + int n_tris, + const pixman_triangle_t *tris) +{ + pixman_trapezoid_t *traps; + + if ((traps = convert_triangles (n_tris, tris))) + { + pixman_add_trapezoids (image, x_off, y_off, + n_tris * 2, traps); + + free (traps); + } +} diff --git a/programs/develop/libraries/pixman/pixman-utils.c b/programs/develop/libraries/pixman/pixman-utils.c index 3ef88b7532..f31171f6d7 100644 --- a/programs/develop/libraries/pixman/pixman-utils.c +++ b/programs/develop/libraries/pixman/pixman-utils.c @@ -31,15 +31,19 @@ #include "pixman-private.h" pixman_bool_t -pixman_multiply_overflows_int (unsigned int a, - unsigned int b) +_pixman_multiply_overflows_size (size_t a, size_t b) +{ + return a >= SIZE_MAX / b; +} + +pixman_bool_t +_pixman_multiply_overflows_int (unsigned int a, unsigned int b) { return a >= INT32_MAX / b; } pixman_bool_t -pixman_addition_overflows_int (unsigned int a, - unsigned int b) +_pixman_addition_overflows_int (unsigned int a, unsigned int b) { return a > INT32_MAX - b; } @@ -67,61 +71,96 @@ pixman_malloc_abc (unsigned int a, return malloc (a * b * c); } -/* - * Helper routine to expand a color 
component from 0 < n <= 8 bits to 16 - * bits by replication. - */ -static inline uint64_t -expand16 (const uint8_t val, int nbits) +static force_inline uint16_t +float_to_unorm (float f, int n_bits) { - /* Start out with the high bit of val in the high bit of result. */ - uint16_t result = (uint16_t)val << (16 - nbits); + uint32_t u; - if (nbits == 0) - return 0; + if (f > 1.0) + f = 1.0; + if (f < 0.0) + f = 0.0; - /* Copy the bits in result, doubling the number of bits each time, until - * we fill all 16 bits. - */ - while (nbits < 16) - { - result |= result >> nbits; - nbits *= 2; - } + u = f * (1 << n_bits); + u -= (u >> n_bits); - return result; + return u; +} + +static force_inline float +unorm_to_float (uint16_t u, int n_bits) +{ + uint32_t m = ((1 << n_bits) - 1); + + return (u & m) * (1.f / (float)m); } /* - * This function expands images from ARGB8 format to ARGB16. To preserve - * precision, it needs to know the original source format. For example, if the - * source was PIXMAN_x1r5g5b5 and the red component contained bits 12345, then - * the expanded value is 12345123. To correctly expand this to 16 bits, it - * should be 1234512345123451 and not 1234512312345123. + * This function expands images from a8r8g8b8 to argb_t. To preserve + * precision, it needs to know from which source format the a8r8g8b8 pixels + * originally came. + * + * For example, if the source was PIXMAN_x1r5g5b5 and the red component + * contained bits 12345, then the 8-bit value is 12345123. To correctly + * expand this to floating point, it should be 12345 / 31.0 and not + * 12345123 / 255.0. */ void -pixman_expand (uint64_t * dst, - const uint32_t * src, - pixman_format_code_t format, - int width) +pixman_expand_to_float (argb_t *dst, + const uint32_t *src, + pixman_format_code_t format, + int width) { + static const float multipliers[16] = { + 0.0f, + 1.0f / ((1 << 1) - 1), + 1.0f / ((1 << 2) - 1), + 1.0f / ((1 << 3) - 1), + 1.0f / ((1 << 4) - 1), + 1.0f / ((1 << 5) - 1), + 1.0f / ((1 << 6) - 1), + 1.0f / ((1 << 7) - 1), + 1.0f / ((1 << 8) - 1), + 1.0f / ((1 << 9) - 1), + 1.0f / ((1 << 10) - 1), + 1.0f / ((1 << 11) - 1), + 1.0f / ((1 << 12) - 1), + 1.0f / ((1 << 13) - 1), + 1.0f / ((1 << 14) - 1), + 1.0f / ((1 << 15) - 1), + }; + int a_size, r_size, g_size, b_size; + int a_shift, r_shift, g_shift, b_shift; + float a_mul, r_mul, g_mul, b_mul; + uint32_t a_mask, r_mask, g_mask, b_mask; + int i; + + if (!PIXMAN_FORMAT_VIS (format)) + format = PIXMAN_a8r8g8b8; + /* * Determine the sizes of each component and the masks and shifts * required to extract them from the source pixel. 
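+ *
+ * As a worked example: for PIXMAN_x1r5g5b5, r_size is 5, so
+ * r_shift = 24 - 5 = 19 and r_mask = 0x1f.  (pixel >> 19) & 0x1f then
+ * recovers the original five red bits from the replicated 8-bit
+ * channel, and r_mul = 1.0f / 31.0f maps them into [0.0, 1.0].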
*/ - const int a_size = PIXMAN_FORMAT_A (format), - r_size = PIXMAN_FORMAT_R (format), - g_size = PIXMAN_FORMAT_G (format), - b_size = PIXMAN_FORMAT_B (format); - const int a_shift = 32 - a_size, - r_shift = 24 - r_size, - g_shift = 16 - g_size, - b_shift = 8 - b_size; - const uint8_t a_mask = ~(~0 << a_size), - r_mask = ~(~0 << r_size), - g_mask = ~(~0 << g_size), - b_mask = ~(~0 << b_size); - int i; + a_size = PIXMAN_FORMAT_A (format); + r_size = PIXMAN_FORMAT_R (format); + g_size = PIXMAN_FORMAT_G (format); + b_size = PIXMAN_FORMAT_B (format); + + a_shift = 32 - a_size; + r_shift = 24 - r_size; + g_shift = 16 - g_size; + b_shift = 8 - b_size; + + a_mask = ((1 << a_size) - 1); + r_mask = ((1 << r_size) - 1); + g_mask = ((1 << g_size) - 1); + b_mask = ((1 << b_size) - 1); + + a_mul = multipliers[a_size]; + r_mul = multipliers[r_size]; + g_mul = multipliers[g_size]; + b_mul = multipliers[b_size]; /* Start at the end so that we can do the expansion in place * when src == dst @@ -129,44 +168,52 @@ pixman_expand (uint64_t * dst, for (i = width - 1; i >= 0; i--) { const uint32_t pixel = src[i]; - const uint8_t a = (pixel >> a_shift) & a_mask, - r = (pixel >> r_shift) & r_mask, - g = (pixel >> g_shift) & g_mask, - b = (pixel >> b_shift) & b_mask; - const uint64_t a16 = a_size ? expand16 (a, a_size) : 0xffff, - r16 = expand16 (r, r_size), - g16 = expand16 (g, g_size), - b16 = expand16 (b, b_size); - dst[i] = a16 << 48 | r16 << 32 | g16 << 16 | b16; + dst[i].a = a_mask? ((pixel >> a_shift) & a_mask) * a_mul : 1.0f; + dst[i].r = ((pixel >> r_shift) & r_mask) * r_mul; + dst[i].g = ((pixel >> g_shift) & g_mask) * g_mul; + dst[i].b = ((pixel >> b_shift) & b_mask) * b_mul; } } -/* - * Contracting is easier than expanding. We just need to truncate the - * components. - */ +uint16_t +pixman_float_to_unorm (float f, int n_bits) +{ + return float_to_unorm (f, n_bits); +} + +float +pixman_unorm_to_float (uint16_t u, int n_bits) +{ + return unorm_to_float (u, n_bits); +} + void -pixman_contract (uint32_t * dst, - const uint64_t *src, - int width) +pixman_contract_from_float (uint32_t *dst, + const argb_t *src, + int width) { int i; - /* Start at the beginning so that we can do the contraction in - * place when src == dst - */ - for (i = 0; i < width; i++) + for (i = 0; i < width; ++i) { - const uint8_t a = src[i] >> 56, - r = src[i] >> 40, - g = src[i] >> 24, - b = src[i] >> 8; + uint8_t a, r, g, b; - dst[i] = a << 24 | r << 16 | g << 8 | b; + a = float_to_unorm (src[i].a, 8); + r = float_to_unorm (src[i].r, 8); + g = float_to_unorm (src[i].g, 8); + b = float_to_unorm (src[i].b, 8); + + dst[i] = (a << 24) | (r << 16) | (g << 8) | (b << 0); } } +uint32_t * +_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask) +{ + return iter->buffer; +} + #define N_TMP_BOXES (16) pixman_bool_t @@ -236,7 +283,14 @@ pixman_region32_copy_from_region16 (pixman_region32_t *dst, return retval; } -#ifdef DEBUG +/* This function is exported for the sake of the test suite and not part + * of the ABI. 
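+ * It simply returns the result of get_implementation (), so the tests
+ * can walk and exercise the delegate chain of implementations.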
+ */ +PIXMAN_EXPORT pixman_implementation_t * +_pixman_internal_only_get_implementation (void) +{ + return get_implementation (); +} void _pixman_log_error (const char *function, const char *message) @@ -254,5 +308,3 @@ _pixman_log_error (const char *function, const char *message) n_messages++; } } - -#endif diff --git a/programs/develop/libraries/pixman/pixman-version.h b/programs/develop/libraries/pixman/pixman-version.h index 172beea370..404227f984 100644 --- a/programs/develop/libraries/pixman/pixman-version.h +++ b/programs/develop/libraries/pixman/pixman-version.h @@ -32,10 +32,10 @@ #endif #define PIXMAN_VERSION_MAJOR 0 -#define PIXMAN_VERSION_MINOR 20 +#define PIXMAN_VERSION_MINOR 30 #define PIXMAN_VERSION_MICRO 2 -#define PIXMAN_VERSION_STRING "0.20.2" +#define PIXMAN_VERSION_STRING "0.30.2" #define PIXMAN_VERSION_ENCODE(major, minor, micro) ( \ ((major) * 10000) \ diff --git a/programs/develop/libraries/pixman/pixman-x64-mmx-emulation.h b/programs/develop/libraries/pixman/pixman-x64-mmx-emulation.h deleted file mode 100644 index 378019cf27..0000000000 --- a/programs/develop/libraries/pixman/pixman-x64-mmx-emulation.h +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef MMX_X64_H_INCLUDED -#define MMX_X64_H_INCLUDED - -/* Implementation of x64 MMX substitition functions, before - * pixman is reimplemented not to use __m64 type on Visual C++ - * - * Copyright (C)2009 by George Yohng - * Released in public domain. - */ - -#include - -#define M64C(a) (*(const __m64 *)(&a)) -#define M64U(a) (*(const unsigned long long *)(&a)) - -__inline __m64 -_m_from_int (int a) -{ - long long i64 = a; - - return M64C (i64); -} - -__inline __m64 -_mm_setzero_si64 () -{ - long long i64 = 0; - - return M64C (i64); -} - -__inline __m64 -_mm_set_pi32 (int i1, int i0) -{ - unsigned long long i64 = ((unsigned)i0) + (((unsigned long long)(unsigned)i1) << 32); - - return M64C (i64); -} - -__inline void -_m_empty () -{ -} - -__inline __m64 -_mm_set1_pi16 (short w) -{ - unsigned long long i64 = ((unsigned long long)(unsigned short)(w)) * 0x0001000100010001ULL; - - return M64C (i64); -} - -__inline int -_m_to_int (__m64 m) -{ - return m.m64_i32[0]; -} - -__inline __m64 -_mm_movepi64_pi64 (__m128i a) -{ - return M64C (a.m128i_i64[0]); -} - -__inline __m64 -_m_pand (__m64 a, __m64 b) -{ - unsigned long long i64 = M64U (a) & M64U (b); - - return M64C (i64); -} - -__inline __m64 -_m_por (__m64 a, __m64 b) -{ - unsigned long long i64 = M64U (a) | M64U (b); - - return M64C (i64); -} - -__inline __m64 -_m_pxor (__m64 a, __m64 b) -{ - unsigned long long i64 = M64U (a) ^ M64U (b); - - return M64C (i64); -} - -__inline __m64 -_m_pmulhuw (__m64 a, __m64 b) /* unoptimized */ -{ - unsigned short d[4] = - { - (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]) >> 16), - (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]) >> 16), - (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]) >> 16), - (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]) >> 16) - }; - - return M64C (d[0]); -} - -__inline __m64 -_m_pmullw2 (__m64 a, __m64 b) /* unoptimized */ -{ - unsigned short d[4] = - { - (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])), - (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1])), - (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2])), - (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3])) - }; - - return M64C (d[0]); -} - -__inline __m64 -_m_pmullw (__m64 a, __m64 b) /* unoptimized */ -{ - unsigned long long x = - ((unsigned long long)(unsigned 
short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]))) + - (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]))) << 16) + - (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]))) << 32) + - (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))) << 48); - - return M64C (x); -} - -__inline __m64 -_m_paddusb (__m64 a, __m64 b) /* unoptimized */ -{ - unsigned long long x = (M64U (a) & 0x00FF00FF00FF00FFULL) + - (M64U (b) & 0x00FF00FF00FF00FFULL); - - unsigned long long y = ((M64U (a) >> 8) & 0x00FF00FF00FF00FFULL) + - ((M64U (b) >> 8) & 0x00FF00FF00FF00FFULL); - - x |= ((x & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF; - y |= ((y & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF; - - x = (x & 0x00FF00FF00FF00FFULL) | ((y & 0x00FF00FF00FF00FFULL) << 8); - - return M64C (x); -} - -__inline __m64 -_m_paddusw (__m64 a, __m64 b) /* unoptimized */ -{ - unsigned long long x = (M64U (a) & 0x0000FFFF0000FFFFULL) + - (M64U (b) & 0x0000FFFF0000FFFFULL); - - unsigned long long y = ((M64U (a) >> 16) & 0x0000FFFF0000FFFFULL) + - ((M64U (b) >> 16) & 0x0000FFFF0000FFFFULL); - - x |= ((x & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; - y |= ((y & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; - - x = (x & 0x0000FFFF0000FFFFULL) | ((y & 0x0000FFFF0000FFFFULL) << 16); - - return M64C (x); -} - -__inline __m64 -_m_pshufw (__m64 a, int n) /* unoptimized */ -{ - unsigned short d[4] = - { - a.m64_u16[n & 3], - a.m64_u16[(n >> 2) & 3], - a.m64_u16[(n >> 4) & 3], - a.m64_u16[(n >> 6) & 3] - }; - - return M64C (d[0]); -} - -__inline unsigned char -sat16 (unsigned short d) -{ - if (d > 0xFF) return 0xFF; - else return d & 0xFF; -} - -__inline __m64 -_m_packuswb (__m64 m1, __m64 m2) /* unoptimized */ -{ - unsigned char d[8] = - { - sat16 (m1.m64_u16[0]), - sat16 (m1.m64_u16[1]), - sat16 (m1.m64_u16[2]), - sat16 (m1.m64_u16[3]), - sat16 (m2.m64_u16[0]), - sat16 (m2.m64_u16[1]), - sat16 (m2.m64_u16[2]), - sat16 (m2.m64_u16[3]) - }; - - return M64C (d[0]); -} - -__inline __m64 _m_punpcklbw (__m64 m1, __m64 m2) /* unoptimized */ -{ - unsigned char d[8] = - { - m1.m64_u8[0], - m2.m64_u8[0], - m1.m64_u8[1], - m2.m64_u8[1], - m1.m64_u8[2], - m2.m64_u8[2], - m1.m64_u8[3], - m2.m64_u8[3], - }; - - return M64C (d[0]); -} - -__inline __m64 _m_punpckhbw (__m64 m1, __m64 m2) /* unoptimized */ -{ - unsigned char d[8] = - { - m1.m64_u8[4], - m2.m64_u8[4], - m1.m64_u8[5], - m2.m64_u8[5], - m1.m64_u8[6], - m2.m64_u8[6], - m1.m64_u8[7], - m2.m64_u8[7], - }; - - return M64C (d[0]); -} - -__inline __m64 _m_psrlwi (__m64 a, int n) /* unoptimized */ -{ - unsigned short d[4] = - { - a.m64_u16[0] >> n, - a.m64_u16[1] >> n, - a.m64_u16[2] >> n, - a.m64_u16[3] >> n - }; - - return M64C (d[0]); -} - -__inline __m64 _m_psrlqi (__m64 m, int n) -{ - unsigned long long x = M64U (m) >> n; - - return M64C (x); -} - -__inline __m64 _m_psllqi (__m64 m, int n) -{ - unsigned long long x = M64U (m) << n; - - return M64C (x); -} - -#endif /* MMX_X64_H_INCLUDED */ diff --git a/programs/develop/libraries/pixman/pixman-x86.c b/programs/develop/libraries/pixman/pixman-x86.c new file mode 100644 index 0000000000..57e4d1f351 --- /dev/null +++ b/programs/develop/libraries/pixman/pixman-x86.c @@ -0,0 +1,237 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. 
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#if defined(USE_X86_MMX) || defined (USE_SSE2)
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-mmmx -msse", as gcc would otherwise generate CMOV instructions
+ * that would raise SIGILL on old CPUs that don't support them.
+ */
+
+typedef enum
+{
+    X86_MMX            = (1 << 0),
+    X86_MMX_EXTENSIONS = (1 << 1),
+    X86_SSE            = (1 << 2) | X86_MMX_EXTENSIONS,
+    X86_SSE2           = (1 << 3),
+    X86_CMOV           = (1 << 4)
+} cpu_features_t;
+
+#ifdef HAVE_GETISAX
+
+#include <sys/auxv.h>
+
+static cpu_features_t
+detect_cpu_features (void)
+{
+    cpu_features_t features = 0;
+    unsigned int result = 0;
+
+    if (getisax (&result, 1))
+    {
+        if (result & AV_386_CMOV)
+            features |= X86_CMOV;
+        if (result & AV_386_MMX)
+            features |= X86_MMX;
+        if (result & AV_386_AMD_MMX)
+            features |= X86_MMX_EXTENSIONS;
+        if (result & AV_386_SSE)
+            features |= X86_SSE;
+        if (result & AV_386_SSE2)
+            features |= X86_SSE2;
+    }
+
+    return features;
+}
+
+#else
+
+#define _PIXMAN_X86_64 \
+    (defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64))
+
+static pixman_bool_t
+have_cpuid (void)
+{
+#if _PIXMAN_X86_64 || defined (_MSC_VER)
+
+    return TRUE;
+
+#elif defined (__GNUC__)
+    uint32_t result;
+
+    __asm__ volatile (
+        "pushf"                  "\n\t"
+        "pop %%eax"              "\n\t"
+        "mov %%eax, %%ecx"       "\n\t"
+        "xor $0x00200000, %%eax" "\n\t"
+        "push %%eax"             "\n\t"
+        "popf"                   "\n\t"
+        "pushf"                  "\n\t"
+        "pop %%eax"              "\n\t"
+        "xor %%ecx, %%eax"       "\n\t"
+        "mov %%eax, %0"          "\n\t"
+        : "=r" (result)
+        :
+        : "%eax", "%ecx");
+
+    return !!result;
+
+#else
+#error "Unknown compiler"
+#endif
+}
+
+static void
+pixman_cpuid (uint32_t feature,
+              uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+#if defined (__GNUC__)
+
+#if _PIXMAN_X86_64
+    __asm__ volatile (
+        "cpuid" "\n\t"
+        : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+        : "a" (feature));
+#else
+    /* On x86-32 we need to be careful about the handling of %ebx
+     * and %esp.  We can't declare either one as clobbered
+     * since they are special registers (%ebx is the "PIC
+     * register" holding an offset to global data, %esp the
+     * stack pointer), so we need to make sure that %ebx is
+     * preserved, and that %esp has its original value when
+     * accessing the output operands.
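+     *
+     * The asm statement below therefore lets the compiler pick a
+     * spare general register to hold %ebx's value and swaps it in
+     * and out around the cpuid with xchg.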
+ */ + __asm__ volatile ( + "xchg %%ebx, %1" "\n\t" + "cpuid" "\n\t" + "xchg %%ebx, %1" "\n\t" + : "=a" (*a), "=r" (*b), "=c" (*c), "=d" (*d) + : "a" (feature)); +#endif + +#elif defined (_MSC_VER) + int info[4]; + + __cpuid (info, feature); + + *a = info[0]; + *b = info[1]; + *c = info[2]; + *d = info[3]; +#else +#error Unknown compiler +#endif +} + +static cpu_features_t +detect_cpu_features (void) +{ + uint32_t a, b, c, d; + cpu_features_t features = 0; + + if (!have_cpuid()) + return features; + + /* Get feature bits */ + pixman_cpuid (0x01, &a, &b, &c, &d); + if (d & (1 << 15)) + features |= X86_CMOV; + if (d & (1 << 23)) + features |= X86_MMX; + if (d & (1 << 25)) + features |= X86_SSE; + if (d & (1 << 26)) + features |= X86_SSE2; + + /* Check for AMD specific features */ + if ((features & X86_MMX) && !(features & X86_SSE)) + { + char vendor[13]; + + /* Get vendor string */ + memset (vendor, 0, sizeof vendor); + + pixman_cpuid (0x00, &a, &b, &c, &d); + memcpy (vendor + 0, &b, 4); + memcpy (vendor + 4, &d, 4); + memcpy (vendor + 8, &c, 4); + + if (strcmp (vendor, "AuthenticAMD") == 0 || + strcmp (vendor, "Geode by NSC") == 0) + { + pixman_cpuid (0x80000000, &a, &b, &c, &d); + if (a >= 0x80000001) + { + pixman_cpuid (0x80000001, &a, &b, &c, &d); + + if (d & (1 << 22)) + features |= X86_MMX_EXTENSIONS; + } + } + } + + return features; +} + +#endif + +static pixman_bool_t +have_feature (cpu_features_t feature) +{ + static pixman_bool_t initialized; + static cpu_features_t features; + + if (!initialized) + { + features = detect_cpu_features(); + initialized = TRUE; + } + + return (features & feature) == feature; +} + +#endif + +pixman_implementation_t * +_pixman_x86_get_implementations (pixman_implementation_t *imp) +{ +#define MMX_BITS (X86_MMX | X86_MMX_EXTENSIONS) +#define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2) + +#ifdef USE_X86_MMX + if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS)) + imp = _pixman_implementation_create_mmx (imp); +#endif + +#ifdef USE_SSE2 + if (!_pixman_disabled ("sse2") && have_feature (SSE2_BITS)) + imp = _pixman_implementation_create_sse2 (imp); +#endif + + return imp; +} diff --git a/programs/develop/libraries/pixman/pixman.c b/programs/develop/libraries/pixman/pixman.c index 3a62b2d7df..184f0c4e6a 100644 --- a/programs/develop/libraries/pixman/pixman.c +++ b/programs/develop/libraries/pixman/pixman.c @@ -30,16 +30,15 @@ #include -static force_inline pixman_implementation_t * -get_implementation (void) +pixman_implementation_t *global_implementation; + +#ifdef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR +static void __attribute__((constructor)) +pixman_constructor (void) { - static pixman_implementation_t *global_implementation; - - if (!global_implementation) - global_implementation = _pixman_choose_implementation (); - - return global_implementation; + global_implementation = _pixman_choose_implementation (); } +#endif typedef struct operator_info_t operator_info_t; @@ -153,57 +152,6 @@ optimize_operator (pixman_op_t op, return operator_table[op].opaque_info[is_dest_opaque | is_source_opaque]; } -static void -apply_workaround (pixman_image_t *image, - int32_t * x, - int32_t * y, - uint32_t ** save_bits, - int * save_dx, - int * save_dy) -{ - if (image && (image->common.flags & FAST_PATH_NEEDS_WORKAROUND)) - { - /* Some X servers generate images that point to the - * wrong place in memory, but then set the clip region - * to point to the right place. Because of an old bug - * in pixman, this would actually work. 
- * - * Here we try and undo the damage - */ - int bpp = PIXMAN_FORMAT_BPP (image->bits.format) / 8; - pixman_box32_t *extents; - uint8_t *t; - int dx, dy; - - extents = pixman_region32_extents (&(image->common.clip_region)); - dx = extents->x1; - dy = extents->y1; - - *save_bits = image->bits.bits; - - *x -= dx; - *y -= dy; - pixman_region32_translate (&(image->common.clip_region), -dx, -dy); - - t = (uint8_t *)image->bits.bits; - t += dy * image->bits.rowstride * 4 + dx * bpp; - image->bits.bits = (uint32_t *)t; - - *save_dx = dx; - *save_dy = dy; - } -} - -static void -unapply_workaround (pixman_image_t *image, uint32_t *bits, int dx, int dy) -{ - if (image && (image->common.flags & FAST_PATH_NEEDS_WORKAROUND)) - { - image->bits.bits = bits; - pixman_region32_translate (&image->common.clip_region, dx, dy); - } -} - /* * Computing composite region */ @@ -276,19 +224,19 @@ clip_source_image (pixman_region32_t * region, * returns FALSE if the final region is empty. Indistinguishable from * an allocation failure, but rendering ignores those anyways. */ -static pixman_bool_t -pixman_compute_composite_region32 (pixman_region32_t * region, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +pixman_bool_t +_pixman_compute_composite_region32 (pixman_region32_t * region, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { region->extents.x1 = dest_x; region->extents.x2 = dest_x + width; @@ -297,8 +245,8 @@ pixman_compute_composite_region32 (pixman_region32_t * region, region->extents.x1 = MAX (region->extents.x1, 0); region->extents.y1 = MAX (region->extents.y1, 0); - region->extents.x2 = MIN (region->extents.x2, dst_image->bits.width); - region->extents.y2 = MIN (region->extents.y2, dst_image->bits.height); + region->extents.x2 = MIN (region->extents.x2, dest_image->bits.width); + region->extents.y2 = MIN (region->extents.y2, dest_image->bits.height); region->data = 0; @@ -313,29 +261,29 @@ pixman_compute_composite_region32 (pixman_region32_t * region, return FALSE; } - if (dst_image->common.have_clip_region) + if (dest_image->common.have_clip_region) { - if (!clip_general_image (region, &dst_image->common.clip_region, 0, 0)) + if (!clip_general_image (region, &dest_image->common.clip_region, 0, 0)) return FALSE; } - if (dst_image->common.alpha_map) + if (dest_image->common.alpha_map) { if (!pixman_region32_intersect_rect (region, region, - dst_image->common.alpha_origin_x, - dst_image->common.alpha_origin_y, - dst_image->common.alpha_map->width, - dst_image->common.alpha_map->height)) + dest_image->common.alpha_origin_x, + dest_image->common.alpha_origin_y, + dest_image->common.alpha_map->width, + dest_image->common.alpha_map->height)) { return FALSE; } if (!pixman_region32_not_empty (region)) return FALSE; - if (dst_image->common.alpha_map->common.have_clip_region) + if (dest_image->common.alpha_map->common.have_clip_region) { - if (!clip_general_image (region, &dst_image->common.alpha_map->common.clip_region, - -dst_image->common.alpha_origin_x, - -dst_image->common.alpha_origin_y)) + if (!clip_general_image (region, &dest_image->common.alpha_map->common.clip_region, + -dest_image->common.alpha_origin_x, + 
-dest_image->common.alpha_origin_y)) { return FALSE; } @@ -377,220 +325,89 @@ pixman_compute_composite_region32 (pixman_region32_t * region, return TRUE; } -#define N_CACHED_FAST_PATHS 8 - typedef struct { - struct - { - pixman_implementation_t * imp; - pixman_fast_path_t fast_path; - } cache [N_CACHED_FAST_PATHS]; -} cache_t; + pixman_fixed_48_16_t x1; + pixman_fixed_48_16_t y1; + pixman_fixed_48_16_t x2; + pixman_fixed_48_16_t y2; +} box_48_16_t; -PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache); - -static force_inline pixman_bool_t -lookup_composite_function (pixman_op_t op, - pixman_format_code_t src_format, - uint32_t src_flags, - pixman_format_code_t mask_format, - uint32_t mask_flags, - pixman_format_code_t dest_format, - uint32_t dest_flags, - pixman_implementation_t **out_imp, - pixman_composite_func_t *out_func) +static pixman_bool_t +compute_transformed_extents (pixman_transform_t *transform, + const pixman_box32_t *extents, + box_48_16_t *transformed) { - pixman_implementation_t *imp; - cache_t *cache; + pixman_fixed_48_16_t tx1, ty1, tx2, ty2; + pixman_fixed_t x1, y1, x2, y2; int i; - /* Check cache for fast paths */ - cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache); + x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2; + y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2; + x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2; + y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2; - for (i = 0; i < N_CACHED_FAST_PATHS; ++i) + if (!transform) { - const pixman_fast_path_t *info = &(cache->cache[i].fast_path); + transformed->x1 = x1; + transformed->y1 = y1; + transformed->x2 = x2; + transformed->y2 = y2; - /* Note that we check for equality here, not whether - * the cached fast path matches. This is to prevent - * us from selecting an overly general fast path - * when a more specific one would work. - */ - if (info->op == op && - info->src_format == src_format && - info->mask_format == mask_format && - info->dest_format == dest_format && - info->src_flags == src_flags && - info->mask_flags == mask_flags && - info->dest_flags == dest_flags && - info->func) - { - *out_imp = cache->cache[i].imp; - *out_func = cache->cache[i].fast_path.func; - - goto update_cache; - } + return TRUE; } - for (imp = get_implementation (); imp != NULL; imp = imp->delegate) + tx1 = ty1 = INT64_MAX; + tx2 = ty2 = INT64_MIN; + + for (i = 0; i < 4; ++i) { - const pixman_fast_path_t *info = imp->fast_paths; + pixman_fixed_48_16_t tx, ty; + pixman_vector_t v; - while (info->op != PIXMAN_OP_NONE) - { - if ((info->op == op || info->op == PIXMAN_OP_any) && - /* Formats */ - ((info->src_format == src_format) || - (info->src_format == PIXMAN_any)) && - ((info->mask_format == mask_format) || - (info->mask_format == PIXMAN_any)) && - ((info->dest_format == dest_format) || - (info->dest_format == PIXMAN_any)) && - /* Flags */ - (info->src_flags & src_flags) == info->src_flags && - (info->mask_flags & mask_flags) == info->mask_flags && - (info->dest_flags & dest_flags) == info->dest_flags) - { - *out_imp = imp; - *out_func = info->func; + v.vector[0] = (i & 0x01)? x1 : x2; + v.vector[1] = (i & 0x02)? 
y1 : y2; + v.vector[2] = pixman_fixed_1; - /* Set i to the last spot in the cache so that the - * move-to-front code below will work - */ - i = N_CACHED_FAST_PATHS - 1; + if (!pixman_transform_point (transform, &v)) + return FALSE; - goto update_cache; - } + tx = (pixman_fixed_48_16_t)v.vector[0]; + ty = (pixman_fixed_48_16_t)v.vector[1]; - ++info; - } + if (tx < tx1) + tx1 = tx; + if (ty < ty1) + ty1 = ty; + if (tx > tx2) + tx2 = tx; + if (ty > ty2) + ty2 = ty; } - return FALSE; -update_cache: - if (i) - { - while (i--) - cache->cache[i + 1] = cache->cache[i]; - - cache->cache[0].imp = *out_imp; - cache->cache[0].fast_path.op = op; - cache->cache[0].fast_path.src_format = src_format; - cache->cache[0].fast_path.src_flags = src_flags; - cache->cache[0].fast_path.mask_format = mask_format; - cache->cache[0].fast_path.mask_flags = mask_flags; - cache->cache[0].fast_path.dest_format = dest_format; - cache->cache[0].fast_path.dest_flags = dest_flags; - cache->cache[0].fast_path.func = *out_func; - } + transformed->x1 = tx1; + transformed->y1 = ty1; + transformed->x2 = tx2; + transformed->y2 = ty2; return TRUE; } -static pixman_bool_t -compute_sample_extents (pixman_transform_t *transform, - pixman_box32_t *extents, int x, int y, - pixman_fixed_t x_off, pixman_fixed_t y_off, - pixman_fixed_t width, pixman_fixed_t height) -{ - pixman_fixed_t x1, y1, x2, y2; - pixman_fixed_48_16_t tx1, ty1, tx2, ty2; - - /* We have checked earlier that (extents->x1 - x) etc. fit in a pixman_fixed_t */ - x1 = (pixman_fixed_48_16_t)pixman_int_to_fixed (extents->x1 - x) + pixman_fixed_1 / 2; - y1 = (pixman_fixed_48_16_t)pixman_int_to_fixed (extents->y1 - y) + pixman_fixed_1 / 2; - x2 = (pixman_fixed_48_16_t)pixman_int_to_fixed (extents->x2 - x) - pixman_fixed_1 / 2; - y2 = (pixman_fixed_48_16_t)pixman_int_to_fixed (extents->y2 - y) - pixman_fixed_1 / 2; - - if (!transform) - { - tx1 = (pixman_fixed_48_16_t)x1; - ty1 = (pixman_fixed_48_16_t)y1; - tx2 = (pixman_fixed_48_16_t)x2; - ty2 = (pixman_fixed_48_16_t)y2; - } - else - { - int i; - - /* Silence GCC */ - tx1 = ty1 = tx2 = ty2 = 0; - - for (i = 0; i < 4; ++i) - { - pixman_fixed_48_16_t tx, ty; - pixman_vector_t v; - - v.vector[0] = (i & 0x01)? x1 : x2; - v.vector[1] = (i & 0x02)? y1 : y2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point (transform, &v)) - return FALSE; - - tx = (pixman_fixed_48_16_t)v.vector[0]; - ty = (pixman_fixed_48_16_t)v.vector[1]; - - if (i == 0) - { - tx1 = tx; - ty1 = ty; - tx2 = tx; - ty2 = ty; - } - else - { - if (tx < tx1) - tx1 = tx; - if (ty < ty1) - ty1 = ty; - if (tx > tx2) - tx2 = tx; - if (ty > ty2) - ty2 = ty; - } - } - } - - /* Expand the source area by a tiny bit so account of different rounding that - * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from - * 0.5 so this won't cause the area computed to be overly pessimistic. 
- */ - tx1 += x_off - 8 * pixman_fixed_e; - ty1 += y_off - 8 * pixman_fixed_e; - tx2 += x_off + width + 8 * pixman_fixed_e; - ty2 += y_off + height + 8 * pixman_fixed_e; - - if (tx1 < pixman_min_fixed_48_16 || tx1 > pixman_max_fixed_48_16 || - ty1 < pixman_min_fixed_48_16 || ty1 > pixman_max_fixed_48_16 || - tx2 < pixman_min_fixed_48_16 || tx2 > pixman_max_fixed_48_16 || - ty2 < pixman_min_fixed_48_16 || ty2 > pixman_max_fixed_48_16) - { - return FALSE; - } - else - { - extents->x1 = pixman_fixed_to_int (tx1); - extents->y1 = pixman_fixed_to_int (ty1); - extents->x2 = pixman_fixed_to_int (tx2) + 1; - extents->y2 = pixman_fixed_to_int (ty2) + 1; - - return TRUE; - } -} - #define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX)) +#define ABS(f) (((f) < 0)? (-(f)) : (f)) +#define IS_16_16(f) (((f) >= pixman_min_fixed_48_16 && ((f) <= pixman_max_fixed_48_16))) static pixman_bool_t -analyze_extent (pixman_image_t *image, int x, int y, - const pixman_box32_t *extents, uint32_t *flags) +analyze_extent (pixman_image_t *image, + const pixman_box32_t *extents, + uint32_t *flags) { pixman_transform_t *transform; - pixman_fixed_t *params; pixman_fixed_t x_off, y_off; pixman_fixed_t width, height; - pixman_box32_t ex; + pixman_fixed_t *params; + box_48_16_t transformed; + pixman_box32_t exp_extents; if (!image) return TRUE; @@ -600,10 +417,10 @@ analyze_extent (pixman_image_t *image, int x, int y, * check here that the expanded-by-one source * extents in destination space fits in 16 bits */ - if (!IS_16BIT (extents->x1 - x - 1) || - !IS_16BIT (extents->y1 - y - 1) || - !IS_16BIT (extents->x2 - x + 1) || - !IS_16BIT (extents->y2 - y + 1)) + if (!IS_16BIT (extents->x1 - 1) || + !IS_16BIT (extents->y1 - 1) || + !IS_16BIT (extents->x2 + 1) || + !IS_16BIT (extents->y2 + 1)) { return FALSE; } @@ -618,18 +435,16 @@ analyze_extent (pixman_image_t *image, int x, int y, if (image->bits.width >= 0x7fff || image->bits.height >= 0x7fff) return FALSE; -#define ID_AND_NEAREST (FAST_PATH_ID_TRANSFORM | FAST_PATH_NEAREST_FILTER) - - if ((image->common.flags & ID_AND_NEAREST) == ID_AND_NEAREST && - extents->x1 - x >= 0 && - extents->y1 - y >= 0 && - extents->x2 - x <= image->bits.width && - extents->y2 - y <= image->bits.height) + if ((image->common.flags & FAST_PATH_ID_TRANSFORM) == FAST_PATH_ID_TRANSFORM && + extents->x1 >= 0 && + extents->y1 >= 0 && + extents->x2 <= image->bits.width && + extents->y2 <= image->bits.height) { - *flags |= FAST_PATH_SAMPLES_COVER_CLIP; + *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; return TRUE; } - + switch (image->common.filter) { case PIXMAN_FILTER_CONVOLUTION: @@ -640,6 +455,14 @@ analyze_extent (pixman_image_t *image, int x, int y, height = params[1]; break; + case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: + params = image->common.filter_params; + x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1); + y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1); + width = params[0]; + height = params[1]; + break; + case PIXMAN_FILTER_GOOD: case PIXMAN_FILTER_BEST: case PIXMAN_FILTER_BILINEAR: @@ -660,17 +483,6 @@ analyze_extent (pixman_image_t *image, int x, int y, default: return FALSE; } - - /* Check whether the non-expanded, transformed extent is entirely within - * the source image, and set the FAST_PATH_SAMPLES_COVER_CLIP if it is. 
- */
-    ex = *extents;
-    if (compute_sample_extents (transform, &ex, x, y, x_off, y_off, width, height) &&
-        ex.x1 >= 0 && ex.y1 >= 0 &&
-        ex.x2 <= image->bits.width && ex.y2 <= image->bits.height)
-    {
-        *flags |= FAST_PATH_SAMPLES_COVER_CLIP;
-    }
     }
     else
     {
@@ -680,18 +492,58 @@ analyze_extent (pixman_image_t *image, int x, int y,
         height = 0;
     }
 
-    /* Check that the extents expanded by one don't overflow. This ensures that
-     * compositing functions can simply walk the source space using 16.16
-     * variables without worrying about overflow.
-     */
-    ex.x1 = extents->x1 - 1;
-    ex.y1 = extents->y1 - 1;
-    ex.x2 = extents->x2 + 1;
-    ex.y2 = extents->y2 + 1;
-
-    if (!compute_sample_extents (transform, &ex, x, y, x_off, y_off, width, height))
+    if (!compute_transformed_extents (transform, extents, &transformed))
         return FALSE;
 
+    /* Expand the source area by a tiny bit to account for different rounding
+     * that may happen during sampling.  Note that (8 * pixman_fixed_e) is very
+     * far from 0.5 so this won't cause the area computed to be overly
+     * pessimistic.
+     */
+    transformed.x1 -= 8 * pixman_fixed_e;
+    transformed.y1 -= 8 * pixman_fixed_e;
+    transformed.x2 += 8 * pixman_fixed_e;
+    transformed.y2 += 8 * pixman_fixed_e;
+
+    if (image->common.type == BITS)
+    {
+        if (pixman_fixed_to_int (transformed.x1) >= 0 &&
+            pixman_fixed_to_int (transformed.y1) >= 0 &&
+            pixman_fixed_to_int (transformed.x2) < image->bits.width &&
+            pixman_fixed_to_int (transformed.y2) < image->bits.height)
+        {
+            *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+        }
+
+        if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2) >= 0 &&
+            pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2) >= 0 &&
+            pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2) < image->bits.width &&
+            pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2) < image->bits.height)
+        {
+            *flags |= FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR;
+        }
+    }
+
+    /* Check we don't overflow when the destination extents are expanded by one.
+     * This ensures that compositing functions can simply walk the source space
+     * using 16.16 variables without worrying about overflow.
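+     * (Here "16.16" means pixman_fixed_t: a signed 32-bit value with 16
+     * fractional bits, so any coordinate walked this way must stay within
+     * roughly +/-32768.)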
+ */ + exp_extents = *extents; + exp_extents.x1 -= 1; + exp_extents.y1 -= 1; + exp_extents.x2 += 1; + exp_extents.y2 += 1; + + if (!compute_transformed_extents (transform, &exp_extents, &transformed)) + return FALSE; + + if (!IS_16_16 (transformed.x1 + x_off - 8 * pixman_fixed_e) || + !IS_16_16 (transformed.y1 + y_off - 8 * pixman_fixed_e) || + !IS_16_16 (transformed.x2 + x_off + 8 * pixman_fixed_e + width) || + !IS_16_16 (transformed.y2 + y_off + 8 * pixman_fixed_e + height)) + { + return FALSE; + } + return TRUE; } @@ -729,18 +581,13 @@ pixman_image_composite32 (pixman_op_t op, int32_t height) { pixman_format_code_t src_format, mask_format, dest_format; - uint32_t src_flags, mask_flags, dest_flags; pixman_region32_t region; - pixman_box32_t *extents; - uint32_t *src_bits; - int src_dx, src_dy; - uint32_t *mask_bits; - int mask_dx, mask_dy; - uint32_t *dest_bits; - int dest_dx, dest_dy; - pixman_bool_t need_workaround; + pixman_box32_t extents; pixman_implementation_t *imp; pixman_composite_func_t func; + pixman_composite_info_t info; + const pixman_box32_t *pbox; + int n; _pixman_image_validate (src); if (mask) @@ -748,26 +595,27 @@ pixman_image_composite32 (pixman_op_t op, _pixman_image_validate (dest); src_format = src->common.extended_format_code; - src_flags = src->common.flags; + info.src_flags = src->common.flags; - if (mask) + if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE)) { mask_format = mask->common.extended_format_code; - mask_flags = mask->common.flags; + info.mask_flags = mask->common.flags; } else { mask_format = PIXMAN_null; - mask_flags = FAST_PATH_IS_OPAQUE; + info.mask_flags = FAST_PATH_IS_OPAQUE; } dest_format = dest->common.extended_format_code; - dest_flags = dest->common.flags; + info.dest_flags = dest->common.flags; /* Check for pixbufs */ if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) && (src->type == BITS && src->bits.bits == mask->bits.bits) && (src->common.repeat == mask->common.repeat) && + (info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM) && (src_x == mask_x && src_y == mask_y)) { if (src_format == PIXMAN_x8b8g8r8) @@ -776,89 +624,92 @@ pixman_image_composite32 (pixman_op_t op, src_format = mask_format = PIXMAN_rpixbuf; } - /* Check for workaround */ - need_workaround = (src_flags | mask_flags | dest_flags) & FAST_PATH_NEEDS_WORKAROUND; - - if (need_workaround) - { - apply_workaround (src, &src_x, &src_y, &src_bits, &src_dx, &src_dy); - apply_workaround (mask, &mask_x, &mask_y, &mask_bits, &mask_dx, &mask_dy); - apply_workaround (dest, &dest_x, &dest_y, &dest_bits, &dest_dx, &dest_dy); - } - pixman_region32_init (®ion); - if (!pixman_compute_composite_region32 ( + if (!_pixman_compute_composite_region32 ( ®ion, src, mask, dest, src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height)) { goto out; } - extents = pixman_region32_extents (®ion); + extents = *pixman_region32_extents (®ion); - if (!analyze_extent (src, dest_x - src_x, dest_y - src_y, extents, &src_flags)) + extents.x1 -= dest_x - src_x; + extents.y1 -= dest_y - src_y; + extents.x2 -= dest_x - src_x; + extents.y2 -= dest_y - src_y; + + if (!analyze_extent (src, &extents, &info.src_flags)) goto out; - if (!analyze_extent (mask, dest_x - mask_x, dest_y - mask_y, extents, &mask_flags)) + extents.x1 -= src_x - mask_x; + extents.y1 -= src_y - mask_y; + extents.x2 -= src_x - mask_x; + extents.y2 -= src_y - mask_y; + + if (!analyze_extent (mask, &extents, &info.mask_flags)) goto out; - /* If the clip is within the source samples, and the samples are opaque, - 
* then the source is effectively opaque. + /* If the clip is within the source samples, and the samples are + * opaque, then the source is effectively opaque. */ -#define BOTH (FAST_PATH_SAMPLES_OPAQUE | FAST_PATH_SAMPLES_COVER_CLIP) +#define NEAREST_OPAQUE (FAST_PATH_SAMPLES_OPAQUE | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) +#define BILINEAR_OPAQUE (FAST_PATH_SAMPLES_OPAQUE | \ + FAST_PATH_BILINEAR_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR) + + if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE || + (info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE) + { + info.src_flags |= FAST_PATH_IS_OPAQUE; + } + + if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE || + (info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE) + { + info.mask_flags |= FAST_PATH_IS_OPAQUE; + } - if ((src_flags & BOTH) == BOTH) - src_flags |= FAST_PATH_IS_OPAQUE; - - if ((mask_flags & BOTH) == BOTH) - mask_flags |= FAST_PATH_IS_OPAQUE; - /* * Check if we can replace our operator by a simpler one * if the src or dest are opaque. The output operator should be * mathematically equivalent to the source. */ - op = optimize_operator (op, src_flags, mask_flags, dest_flags); - if (op == PIXMAN_OP_DST) - goto out; + info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags); - if (lookup_composite_function (op, - src_format, src_flags, - mask_format, mask_flags, - dest_format, dest_flags, - &imp, &func)) + _pixman_implementation_lookup_composite ( + get_implementation (), info.op, + src_format, info.src_flags, + mask_format, info.mask_flags, + dest_format, info.dest_flags, + &imp, &func); + + info.src_image = src; + info.mask_image = mask; + info.dest_image = dest; + + pbox = pixman_region32_rectangles (®ion, &n); + + while (n--) { - const pixman_box32_t *pbox; - int n; + info.src_x = pbox->x1 + src_x - dest_x; + info.src_y = pbox->y1 + src_y - dest_y; + info.mask_x = pbox->x1 + mask_x - dest_x; + info.mask_y = pbox->y1 + mask_y - dest_y; + info.dest_x = pbox->x1; + info.dest_y = pbox->y1; + info.width = pbox->x2 - pbox->x1; + info.height = pbox->y2 - pbox->y1; - pbox = pixman_region32_rectangles (®ion, &n); - - while (n--) - { - func (imp, op, - src, mask, dest, - pbox->x1 + src_x - dest_x, - pbox->y1 + src_y - dest_y, - pbox->x1 + mask_x - dest_x, - pbox->y1 + mask_y - dest_y, - pbox->x1, - pbox->y1, - pbox->x2 - pbox->x1, - pbox->y2 - pbox->y1); - - pbox++; - } + func (imp, &info); + + pbox++; } out: - if (need_workaround) - { - unapply_workaround (src, src_bits, src_dx, src_dy); - unapply_workaround (mask, mask_bits, mask_dx, mask_dy); - unapply_workaround (dest, dest_bits, dest_dx, dest_dy); - } - pixman_region32_fini (®ion); } @@ -889,8 +740,8 @@ pixman_blt (uint32_t *src_bits, int dst_bpp, int src_x, int src_y, - int dst_x, - int dst_y, + int dest_x, + int dest_y, int width, int height) { @@ -898,7 +749,7 @@ pixman_blt (uint32_t *src_bits, src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, src_x, src_y, - dst_x, dst_y, + dest_x, dest_y, width, height); } @@ -910,10 +761,10 @@ pixman_fill (uint32_t *bits, int y, int width, int height, - uint32_t xor) + uint32_t filler) { return _pixman_implementation_fill ( - get_implementation(), bits, stride, bpp, x, y, width, height, xor); + get_implementation(), bits, stride, bpp, x, y, width, height, filler); } static uint32_t @@ -927,9 +778,9 @@ color_to_uint32 (const pixman_color_t *color) } static pixman_bool_t -color_to_pixel (pixman_color_t * color, - uint32_t * pixel, - pixman_format_code_t 
@@ -927,9 +778,9 @@ color_to_uint32 (const pixman_color_t *color)
 }
 
 static pixman_bool_t
-color_to_pixel (pixman_color_t *     color,
-                uint32_t *           pixel,
-                pixman_format_code_t format)
+color_to_pixel (const pixman_color_t *color,
+                uint32_t *            pixel,
+                pixman_format_code_t  format)
 {
     uint32_t c = color_to_uint32 (color);
 
@@ -939,9 +790,12 @@ color_to_pixel (pixman_color_t * color,
           format == PIXMAN_x8b8g8r8 ||
           format == PIXMAN_b8g8r8a8 ||
           format == PIXMAN_b8g8r8x8 ||
+          format == PIXMAN_r8g8b8a8 ||
+          format == PIXMAN_r8g8b8x8 ||
           format == PIXMAN_r5g6b5   ||
           format == PIXMAN_b5g6r5   ||
-          format == PIXMAN_a8))
+          format == PIXMAN_a8       ||
+          format == PIXMAN_a1))
     {
         return FALSE;
     }
@@ -960,12 +814,16 @@ color_to_pixel (pixman_color_t * color,
              ((c & 0x0000ff00) << 8) |
              ((c & 0x000000ff) << 24);
     }
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA)
+        c = ((c & 0xff000000) >> 24) | (c << 8);
 
-    if (format == PIXMAN_a8)
+    if (format == PIXMAN_a1)
+        c = c >> 31;
+    else if (format == PIXMAN_a8)
         c = c >> 24;
     else if (format == PIXMAN_r5g6b5 ||
              format == PIXMAN_b5g6r5)
-        c = CONVERT_8888_TO_0565 (c);
+        c = convert_8888_to_0565 (c);
 
 #if 0
     printf ("color: %x %x %x %x\n",
             color->alpha, color->red, color->green, color->blue);
@@ -979,7 +837,7 @@ color_to_pixel (pixman_color_t * color,
 PIXMAN_EXPORT pixman_bool_t
 pixman_image_fill_rectangles (pixman_op_t                 op,
                               pixman_image_t *            dest,
-                              pixman_color_t *            color,
+                              const pixman_color_t *      color,
                               int                         n_rects,
                               const pixman_rectangle16_t *rects)
 {
@@ -1018,7 +876,7 @@ pixman_image_fill_rectangles (pixman_op_t op,
 PIXMAN_EXPORT pixman_bool_t
 pixman_image_fill_boxes (pixman_op_t           op,
                          pixman_image_t *      dest,
-                         pixman_color_t *      color,
+                         const pixman_color_t *color,
                          int                   n_boxes,
                          const pixman_box32_t *boxes)
 {
@@ -1163,11 +1021,14 @@ pixman_format_supported_source (pixman_format_code_t format)
     case PIXMAN_a2r10g10b10:
     case PIXMAN_x2r10g10b10:
     case PIXMAN_a8r8g8b8:
+    case PIXMAN_a8r8g8b8_sRGB:
     case PIXMAN_x8r8g8b8:
     case PIXMAN_a8b8g8r8:
     case PIXMAN_x8b8g8r8:
     case PIXMAN_b8g8r8a8:
     case PIXMAN_b8g8r8x8:
+    case PIXMAN_r8g8b8a8:
+    case PIXMAN_r8g8b8x8:
     case PIXMAN_r8g8b8:
     case PIXMAN_b8g8r8:
     case PIXMAN_r5g6b5:
@@ -1243,7 +1104,7 @@ PIXMAN_EXPORT pixman_bool_t
 pixman_compute_composite_region (pixman_region16_t * region,
                                  pixman_image_t *    src_image,
                                  pixman_image_t *    mask_image,
-                                 pixman_image_t *    dst_image,
+                                 pixman_image_t *    dest_image,
                                  int16_t             src_x,
                                  int16_t             src_y,
                                  int16_t             mask_x,
@@ -1258,8 +1119,8 @@ pixman_compute_composite_region (pixman_region16_t * region,
 
     pixman_region32_init (&r32);
 
-    retval = pixman_compute_composite_region32 (
-        &r32, src_image, mask_image, dst_image,
+    retval = _pixman_compute_composite_region32 (
+        &r32, src_image, mask_image, dest_image,
         src_x, src_y, mask_x, mask_y, dest_x, dest_y,
         width, height);
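The PIXMAN_TYPE_RGBA branch added to color_to_pixel above rotates alpha from the top byte to the bottom one. A standalone sketch of that rotation with a worked value:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
        uint32_t c = 0x80ff4020; /* a8r8g8b8: a=0x80 r=0xff g=0x40 b=0x20 */

        /* Same rotation as color_to_pixel's PIXMAN_TYPE_RGBA branch. */
        c = ((c & 0xff000000) >> 24) | (c << 8);

        printf ("%08x\n", c);    /* prints ff402080: r, g, b, a */
        return 0;
    }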
diff --git a/programs/develop/libraries/pixman/pixman.h b/programs/develop/libraries/pixman/pixman.h
index cfffa79add..7ff9fb52a1 100644
--- a/programs/develop/libraries/pixman/pixman.h
+++ b/programs/develop/libraries/pixman/pixman.h
@@ -226,6 +226,9 @@ pixman_bool_t pixman_transform_is_inverse (const struct pixman_transform *
 /*
  * Floating point matrices
  */
+typedef struct pixman_f_transform pixman_f_transform_t;
+typedef struct pixman_f_vector pixman_f_vector_t;
+
 struct pixman_f_vector
 {
     double v[3];
@@ -289,7 +292,28 @@ typedef enum
     PIXMAN_FILTER_BEST,
     PIXMAN_FILTER_NEAREST,
     PIXMAN_FILTER_BILINEAR,
-    PIXMAN_FILTER_CONVOLUTION
+    PIXMAN_FILTER_CONVOLUTION,
+
+    /* The SEPARABLE_CONVOLUTION filter takes the following parameters:
+     *
+     *     width:        integer given as 16.16 fixpoint number
+     *     height:       integer given as 16.16 fixpoint number
+     *     x_phase_bits: integer given as 16.16 fixpoint
+     *     y_phase_bits: integer given as 16.16 fixpoint
+     *     xtables:      (1 << x_phase_bits) tables of size width
+     *     ytables:      (1 << y_phase_bits) tables of size height
+     *
+     * When sampling at (x, y), the location is first rounded to one of
+     * n_x_phases * n_y_phases subpixel positions. These subpixel positions
+     * determine an xtable and a ytable to use.
+     *
+     * Conceptually a width x height matrix is then formed in which each entry
+     * is the product of the corresponding entries in the x and y tables.
+     * This matrix is then aligned with the image pixels such that its center
+     * is as close as possible to the subpixel location chosen earlier. Then
+     * the image is convolved with the matrix and the resulting pixel returned.
+     */
+    PIXMAN_FILTER_SEPARABLE_CONVOLUTION
 } pixman_filter_t;
 
 typedef enum
@@ -466,6 +490,7 @@ pixman_bool_t pixman_region_equal (pixman_region16_t *reg
 pixman_bool_t           pixman_region_selfcheck   (pixman_region16_t *region);
 void                    pixman_region_reset       (pixman_region16_t *region,
                                                    pixman_box16_t    *box);
+void                    pixman_region_clear       (pixman_region16_t *region);
 /*
  * 32 bit regions
  */
@@ -560,6 +585,7 @@ pixman_bool_t pixman_region32_equal (pixman_region32_t *r
 pixman_bool_t           pixman_region32_selfcheck (pixman_region32_t *region);
 void                    pixman_region32_reset     (pixman_region32_t *region,
                                                    pixman_box32_t    *box);
+void                    pixman_region32_clear     (pixman_region32_t *region);
 
 /* Copy / Fill / Misc */
@@ -571,8 +597,8 @@ pixman_bool_t pixman_blt (uint32_t *src_bits,
                           int       dst_bpp,
                           int       src_x,
                           int       src_y,
-                          int       dst_x,
-                          int       dst_y,
+                          int       dest_x,
+                          int       dest_y,
                           int       width,
                           int       height);
 pixman_bool_t pixman_fill (uint32_t *bits,
@@ -650,11 +676,14 @@ struct pixman_indexed
 #define PIXMAN_TYPE_YUY2      6
 #define PIXMAN_TYPE_YV12      7
 #define PIXMAN_TYPE_BGRA      8
+#define PIXMAN_TYPE_RGBA      9
+#define PIXMAN_TYPE_ARGB_SRGB 10
 
 #define PIXMAN_FORMAT_COLOR(f)                          \
        (PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||    \
         PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||    \
-        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA)
+        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||    \
+        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
 
 /* 32bpp formats */
 typedef enum {
@@ -664,12 +693,17 @@ typedef enum {
     PIXMAN_x8b8g8r8    = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
     PIXMAN_b8g8r8a8    = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
     PIXMAN_b8g8r8x8    = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
+    PIXMAN_r8g8b8a8    = PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
+    PIXMAN_r8g8b8x8    = PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
     PIXMAN_x14r6g6b6   = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
     PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
     PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
     PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
     PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
 
+/* sRGB formats */
+    PIXMAN_a8r8g8b8_sRGB = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB_SRGB,8,8,8,8),
+
 /* 24bpp formats */
     PIXMAN_r8g8b8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
     PIXMAN_b8g8r8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
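Each format code packs bpp, type, and channel widths into one integer, so the new PIXMAN_r8g8b8a8 entry can be decoded with the existing accessor macros; a sketch:

    #include <pixman.h>
    #include <stdio.h>

    int
    main (void)
    {
        pixman_format_code_t f = PIXMAN_r8g8b8a8;

        printf ("bpp=%d type=%d a=%d r=%d g=%d b=%d color=%d\n",
                PIXMAN_FORMAT_BPP (f), PIXMAN_FORMAT_TYPE (f),
                PIXMAN_FORMAT_A (f), PIXMAN_FORMAT_R (f),
                PIXMAN_FORMAT_G (f), PIXMAN_FORMAT_B (f),
                PIXMAN_FORMAT_COLOR (f) ? 1 : 0); /* RGBA now counts as color */
        return 0;
    }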
@@ -727,18 +761,18 @@ pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
 pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
 
 /* Constructors */
-pixman_image_t *pixman_image_create_solid_fill       (pixman_color_t               *color);
-pixman_image_t *pixman_image_create_linear_gradient  (pixman_point_fixed_t         *p1,
-                                                      pixman_point_fixed_t         *p2,
+pixman_image_t *pixman_image_create_solid_fill       (const pixman_color_t         *color);
+pixman_image_t *pixman_image_create_linear_gradient  (const pixman_point_fixed_t   *p1,
+                                                      const pixman_point_fixed_t   *p2,
                                                       const pixman_gradient_stop_t *stops,
                                                       int                           n_stops);
-pixman_image_t *pixman_image_create_radial_gradient  (pixman_point_fixed_t         *inner,
-                                                      pixman_point_fixed_t         *outer,
+pixman_image_t *pixman_image_create_radial_gradient  (const pixman_point_fixed_t   *inner,
+                                                      const pixman_point_fixed_t   *outer,
                                                       pixman_fixed_t                inner_radius,
                                                       pixman_fixed_t                outer_radius,
                                                       const pixman_gradient_stop_t *stops,
                                                       int                           n_stops);
-pixman_image_t *pixman_image_create_conical_gradient (pixman_point_fixed_t         *center,
+pixman_image_t *pixman_image_create_conical_gradient (const pixman_point_fixed_t   *center,
                                                       pixman_fixed_t                angle,
                                                       const pixman_gradient_stop_t *stops,
                                                       int                           n_stops);
@@ -747,6 +781,11 @@ pixman_image_t *pixman_image_create_bits (pixman_format_code_t
                                           int       height,
                                           uint32_t *bits,
                                           int       rowstride_bytes);
+pixman_image_t *pixman_image_create_bits_no_clear (pixman_format_code_t format,
+                                                   int                  width,
+                                                   int                  height,
+                                                   uint32_t *           bits,
+                                                   int                  rowstride_bytes);
 
 /* Destructor */
 pixman_image_t *pixman_image_ref   (pixman_image_t *image);
@@ -792,14 +831,41 @@ int pixman_image_get_height (pixman_image_t
 int                  pixman_image_get_stride (pixman_image_t *image); /* in bytes */
 int                  pixman_image_get_depth  (pixman_image_t *image);
 pixman_format_code_t pixman_image_get_format (pixman_image_t *image);
+
+typedef enum
+{
+    PIXMAN_KERNEL_IMPULSE,
+    PIXMAN_KERNEL_BOX,
+    PIXMAN_KERNEL_LINEAR,
+    PIXMAN_KERNEL_CUBIC,
+    PIXMAN_KERNEL_GAUSSIAN,
+    PIXMAN_KERNEL_LANCZOS2,
+    PIXMAN_KERNEL_LANCZOS3,
+    PIXMAN_KERNEL_LANCZOS3_STRETCHED /* Jim Blinn's 'nice' filter */
+} pixman_kernel_t;
+
+/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
+ * with the given kernels and scale parameters.
+ */
+pixman_fixed_t *
+pixman_filter_create_separable_convolution (int             *n_values,
+                                            pixman_fixed_t   scale_x,
+                                            pixman_fixed_t   scale_y,
+                                            pixman_kernel_t  reconstruct_x,
+                                            pixman_kernel_t  reconstruct_y,
+                                            pixman_kernel_t  sample_x,
+                                            pixman_kernel_t  sample_y,
+                                            int              subsample_bits_x,
+                                            int              subsample_bits_y);
+
 pixman_bool_t pixman_image_fill_rectangles (pixman_op_t                 op,
                                             pixman_image_t             *image,
-                                            pixman_color_t             *color,
+                                            const pixman_color_t       *color,
                                             int                         n_rects,
                                             const pixman_rectangle16_t *rects);
 pixman_bool_t pixman_image_fill_boxes      (pixman_op_t            op,
                                             pixman_image_t        *dest,
-                                            pixman_color_t        *color,
+                                            const pixman_color_t  *color,
                                             int                    n_boxes,
                                             const pixman_box32_t  *boxes);
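A caller-side sketch of how the kernel enum and the constructor above fit together: build a parameter list for a 2x downscale and attach it with pixman_image_set_filter, which copies the values (the kernel choices below are illustrative, not a recommendation):

    #include <pixman.h>
    #include <stdlib.h>

    /* Attach a SEPARABLE_CONVOLUTION filter sized for a 2x downscale. */
    static pixman_bool_t
    set_downscale_filter (pixman_image_t *src)
    {
        int n_values;
        pixman_fixed_t *params = pixman_filter_create_separable_convolution (
            &n_values,
            pixman_double_to_fixed (2.0),  /* scale_x: source/dest ratio */
            pixman_double_to_fixed (2.0),  /* scale_y */
            PIXMAN_KERNEL_LINEAR,          /* reconstruct_x */
            PIXMAN_KERNEL_LINEAR,          /* reconstruct_y */
            PIXMAN_KERNEL_BOX,             /* sample_x */
            PIXMAN_KERNEL_BOX,             /* sample_y */
            4, 4);                         /* subsample_bits: 16 phases each */

        if (!params)
            return FALSE;

        pixman_image_set_filter (src, PIXMAN_FILTER_SEPARABLE_CONVOLUTION,
                                 params, n_values);
        free (params);
        return TRUE;
    }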
@@ -807,7 +873,7 @@ pixman_bool_t pixman_image_fill_boxes (pixman_op_t
 pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
                                                pixman_image_t    *src_image,
                                                pixman_image_t    *mask_image,
-                                               pixman_image_t    *dst_image,
+                                               pixman_image_t    *dest_image,
                                                int16_t            src_x,
                                                int16_t            src_y,
                                                int16_t            mask_x,
@@ -841,19 +907,84 @@ void pixman_image_composite32 (pixman_op_t op,
                                int32_t width,
                                int32_t height);
 
-/* Old X servers rely on out-of-bounds accesses when they are asked
- * to composite with a window as the source. They create a pixman image
- * pointing to some bogus position in memory, but then they set a clip
- * region to the position where the actual bits are.
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
  *
  * Due to a bug in old versions of pixman, where it would not clip
  * against the image bounds when a clip region was set, this would
- * actually work. So by default we allow certain out-of-bound access
- * to happen unless explicitly disabled.
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
  *
- * Fixed X servers should call this function to disable the workaround.
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
+ * function is a no-op.
  */
-void pixman_disable_out_of_bounds_workaround (void);
+void pixman_disable_out_of_bounds_workaround (void);
+
+/*
+ * Glyphs
+ */
+typedef struct pixman_glyph_cache_t pixman_glyph_cache_t;
+typedef struct
+{
+    int         x, y;
+    const void *glyph;
+} pixman_glyph_t;
+
+pixman_glyph_cache_t *pixman_glyph_cache_create    (void);
+void                  pixman_glyph_cache_destroy   (pixman_glyph_cache_t *cache);
+void                  pixman_glyph_cache_freeze    (pixman_glyph_cache_t *cache);
+void                  pixman_glyph_cache_thaw      (pixman_glyph_cache_t *cache);
+const void *          pixman_glyph_cache_lookup    (pixman_glyph_cache_t *cache,
+                                                    void                 *font_key,
+                                                    void                 *glyph_key);
+const void *          pixman_glyph_cache_insert    (pixman_glyph_cache_t *cache,
+                                                    void                 *font_key,
+                                                    void                 *glyph_key,
+                                                    int                   origin_x,
+                                                    int                   origin_y,
+                                                    pixman_image_t       *glyph_image);
+void                  pixman_glyph_cache_remove    (pixman_glyph_cache_t *cache,
+                                                    void                 *font_key,
+                                                    void                 *glyph_key);
+void                  pixman_glyph_get_extents     (pixman_glyph_cache_t *cache,
+                                                    int                   n_glyphs,
+                                                    pixman_glyph_t       *glyphs,
+                                                    pixman_box32_t       *extents);
+pixman_format_code_t  pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache,
+                                                    int                   n_glyphs,
+                                                    const pixman_glyph_t *glyphs);
+void                  pixman_composite_glyphs         (pixman_op_t           op,
+                                                       pixman_image_t       *src,
+                                                       pixman_image_t       *dest,
+                                                       pixman_format_code_t  mask_format,
+                                                       int32_t               src_x,
+                                                       int32_t               src_y,
+                                                       int32_t               mask_x,
+                                                       int32_t               mask_y,
+                                                       int32_t               dest_x,
+                                                       int32_t               dest_y,
+                                                       int32_t               width,
+                                                       int32_t               height,
+                                                       pixman_glyph_cache_t *cache,
+                                                       int                   n_glyphs,
+                                                       const pixman_glyph_t *glyphs);
+void                  pixman_composite_glyphs_no_mask (pixman_op_t           op,
+                                                       pixman_image_t       *src,
+                                                       pixman_image_t       *dest,
+                                                       int32_t               src_x,
+                                                       int32_t               src_y,
+                                                       int32_t               dest_x,
+                                                       int32_t               dest_y,
+                                                       pixman_glyph_cache_t *cache,
+                                                       int                   n_glyphs,
+                                                       const pixman_glyph_t *glyphs);
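The declarations above imply a freeze/insert/composite/thaw discipline: glyphs may only be inserted, and must stay valid, while the cache is frozen. A sketch of one plausible flow (font_key and glyph_key are caller-chosen stable pointers; error handling is abbreviated):

    #include <pixman.h>

    static void
    draw_one_glyph (pixman_image_t *src, pixman_image_t *dest,
                    void *font_key, void *glyph_key,
                    pixman_image_t *glyph_image)
    {
        pixman_glyph_cache_t *cache = pixman_glyph_cache_create ();
        pixman_glyph_t g = { 10, 20, NULL };  /* place glyph at (10, 20) */

        pixman_glyph_cache_freeze (cache);

        g.glyph = pixman_glyph_cache_lookup (cache, font_key, glyph_key);
        if (!g.glyph)
            g.glyph = pixman_glyph_cache_insert (cache, font_key, glyph_key,
                                                 0, 0, glyph_image);

        if (g.glyph) /* insert returns NULL on allocation failure */
        {
            pixman_composite_glyphs_no_mask (PIXMAN_OP_OVER, src, dest,
                                             0, 0,  /* src_x, src_y */
                                             0, 0,  /* dest_x, dest_y */
                                             cache, 1, &g);
        }

        pixman_glyph_cache_thaw (cache);
        pixman_glyph_cache_destroy (cache);
    }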
 
 /*
  * Trapezoids
  */
@@ -862,6 +993,7 @@ typedef struct pixman_edge pixman_edge_t;
 typedef struct pixman_trapezoid pixman_trapezoid_t;
 typedef struct pixman_trap pixman_trap_t;
 typedef struct pixman_span_fix pixman_span_fix_t;
+typedef struct pixman_triangle pixman_triangle_t;
 
 /*
  * An edge structure.  This represents a single polygon edge
@@ -889,6 +1021,10 @@ struct pixman_trapezoid
     pixman_line_fixed_t left, right;
 };
 
+struct pixman_triangle
+{
+    pixman_point_fixed_t p1, p2, p3;
+};
+
 /* whether 't' is a well defined not obviously empty trapezoid */
 #define pixman_trapezoid_valid(t)                          \
@@ -934,7 +1070,7 @@ void pixman_add_traps (pixman_image_t *image,
                        int16_t              x_off,
                        int16_t              y_off,
                        int                  ntrap,
-                       pixman_trap_t       *traps);
+                       const pixman_trap_t *traps);
 void pixman_add_trapezoids       (pixman_image_t           *image,
                                   int16_t                   x_off,
                                   int                       y_off,
@@ -944,6 +1080,31 @@ void pixman_rasterize_trapezoid (pixman_image_t *image,
                                  const pixman_trapezoid_t *trap,
                                  int                       x_off,
                                  int                       y_off);
+void pixman_composite_trapezoids (pixman_op_t               op,
+                                  pixman_image_t           *src,
+                                  pixman_image_t           *dst,
+                                  pixman_format_code_t      mask_format,
+                                  int                       x_src,
+                                  int                       y_src,
+                                  int                       x_dst,
+                                  int                       y_dst,
+                                  int                       n_traps,
+                                  const pixman_trapezoid_t *traps);
+void pixman_composite_triangles  (pixman_op_t               op,
+                                  pixman_image_t           *src,
+                                  pixman_image_t           *dst,
+                                  pixman_format_code_t      mask_format,
+                                  int                       x_src,
+                                  int                       y_src,
+                                  int                       x_dst,
+                                  int                       y_dst,
+                                  int                       n_tris,
+                                  const pixman_triangle_t  *tris);
+void pixman_add_triangles        (pixman_image_t           *image,
+                                  int32_t                   x_off,
+                                  int32_t                   y_off,
+                                  int                       n_tris,
+                                  const pixman_triangle_t  *tris);
 
 PIXMAN_END_DECLS
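Finally, a sketch of the new triangle entry point: composite a solid source through a single-triangle a8 mask onto a destination (coordinates are 16.16 fixed-point; the geometry is illustrative):

    #include <pixman.h>

    static void
    draw_triangle (pixman_image_t *dest)
    {
        pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff };
        pixman_image_t *src = pixman_image_create_solid_fill (&white);

        pixman_triangle_t tri = {
            { pixman_int_to_fixed (10), pixman_int_to_fixed (10) }, /* p1 */
            { pixman_int_to_fixed (90), pixman_int_to_fixed (30) }, /* p2 */
            { pixman_int_to_fixed (30), pixman_int_to_fixed (90) }, /* p3 */
        };

        pixman_composite_triangles (PIXMAN_OP_OVER, src, dest, PIXMAN_a8,
                                    0, 0,  /* x_src, y_src */
                                    0, 0,  /* x_dst, y_dst */
                                    1, &tri);

        pixman_image_unref (src);
    }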