From d605acf341d25a0e9b1a197cd17bff2c5d44e8e6 Mon Sep 17 00:00:00 2001
From: "Sergey Semyonov (Serge)"
Date: Tue, 21 Jan 2014 14:20:59 +0000
Subject: [PATCH] intel-2d: sna-2.99.906

git-svn-id: svn://kolibrios.org@4501 a494cfbc-eb01-0410-851d-a64ba20cac60
---
 contrib/sdk/sources/Intel-2D/Makefile         |   10 +-
 contrib/sdk/sources/Intel-2D/intel_driver.h   |    1 +
 contrib/sdk/sources/Intel-2D/intel_list.h     |    2 +-
 contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c |   16 +-
 .../sdk/sources/Intel-2D/sna/gen3_render.c    | 2479 +++++++++++++++--
 .../sdk/sources/Intel-2D/sna/gen4_common.c    |   64 +
 .../sdk/sources/Intel-2D/sna/gen4_common.h    |   49 +
 .../sdk/sources/Intel-2D/sna/gen4_render.c    |  101 +-
 .../sdk/sources/Intel-2D/sna/gen4_vertex.c    |   87 +-
 .../sdk/sources/Intel-2D/sna/gen4_vertex.h    |    1 +
 .../sdk/sources/Intel-2D/sna/gen5_render.c    |   97 +-
 .../sdk/sources/Intel-2D/sna/gen6_common.c    |   71 +
 .../sdk/sources/Intel-2D/sna/gen6_common.h    |  139 +
 .../sdk/sources/Intel-2D/sna/gen6_render.c    |  194 +-
 .../sdk/sources/Intel-2D/sna/gen7_render.c    |  205 +-
 contrib/sdk/sources/Intel-2D/sna/kgem.c       | 1032 ++++---
 contrib/sdk/sources/Intel-2D/sna/kgem.h       |  169 +-
 contrib/sdk/sources/Intel-2D/sna/sna.c        |   28 +-
 contrib/sdk/sources/Intel-2D/sna/sna.h        |    9 +-
 contrib/sdk/sources/Intel-2D/sna/sna_reg.h    |   32 +-
 contrib/sdk/sources/Intel-2D/sna/sna_render.h |    5 +-
 contrib/sdk/sources/Intel-2D/uxa/uxa.c        |   14 +-
 22 files changed, 3686 insertions(+), 1119 deletions(-)
 create mode 100644 contrib/sdk/sources/Intel-2D/sna/gen4_common.c
 create mode 100644 contrib/sdk/sources/Intel-2D/sna/gen4_common.h
 create mode 100644 contrib/sdk/sources/Intel-2D/sna/gen6_common.c
 create mode 100644 contrib/sdk/sources/Intel-2D/sna/gen6_common.h

diff --git a/contrib/sdk/sources/Intel-2D/Makefile b/contrib/sdk/sources/Intel-2D/Makefile
index 9ebbd8f58e..5f5839aa79 100644
--- a/contrib/sdk/sources/Intel-2D/Makefile
+++ b/contrib/sdk/sources/Intel-2D/Makefile
@@ -13,6 +13,7 @@ STRIP = $(PREFIX)strip
 LDFLAGS:= -shared -s -nostdlib -T ../newlib/dll.lds --entry _DllStartup --image-base=0
 PXFLAGS:= --version-script pixlib.ver --output-def $(LIBRARY).orig.def --out-implib $(LIBRARY).dll.a
 SNAFLAGS:= --version-script sna.ver --output-def sna.def
+UXAFLAGS:= --version-script uxa.ver --output-def uxa.def
 INCLUDES= -I. 
-I../libdrm/intel -I../libdrm/include/drm -I./render_program -I../pixman -I../newlib/include @@ -25,7 +26,10 @@ DEFINES:= -DHAS_DEBUG_FULL=0 -DSHOW_BATCH=0 -DDEBUG_DUMP=0 SRC_PIXLIB = pixlib.c -SRC_SNA = sna/gen3_render.c \ +SRC_SNA = \ + sna/gen4_common.c \ + sna/gen6_common.c \ + sna/gen3_render.c \ sna/gen4_render.c \ sna/gen4_vertex.c \ sna/gen5_render.c \ @@ -70,7 +74,7 @@ endif # targets all:$(LIBRARY).dll intel-sna.drv -uxa:$(LIBRARY).dll +uxa:$(LIBRARY).dll intel-uxa.drv ebox:$(LIBRARY).dll @@ -86,7 +90,7 @@ intel-sna.drv: $(OBJ_SNA) Makefile mv -f $@ ../../bin intel-uxa.drv: $(OBJ_UXA) Makefile - $(LD) $(LDFLAGS) $(LIBPATH) -o $@ $(OBJ_UXA) $(LIBS) + $(LD) $(LDFLAGS) $(UXAFLAGS) $(LIBPATH) -o $@ $(OBJ_UXA) $(LIBS) $(STRIP) $@ mv -f $@ ../../bin diff --git a/contrib/sdk/sources/Intel-2D/intel_driver.h b/contrib/sdk/sources/Intel-2D/intel_driver.h index af74a9a076..b43370a01d 100644 --- a/contrib/sdk/sources/Intel-2D/intel_driver.h +++ b/contrib/sdk/sources/Intel-2D/intel_driver.h @@ -118,5 +118,6 @@ struct intel_device_info { const struct intel_device_info *intel_detect_chipset(struct pci_device *pci); +#define hosted() (0) #endif /* INTEL_DRIVER_H */ diff --git a/contrib/sdk/sources/Intel-2D/intel_list.h b/contrib/sdk/sources/Intel-2D/intel_list.h index 38e4d52fd3..42653c9f0a 100644 --- a/contrib/sdk/sources/Intel-2D/intel_list.h +++ b/contrib/sdk/sources/Intel-2D/intel_list.h @@ -261,7 +261,7 @@ static inline void list_move_tail(struct list *list, struct list *head) * @return True if the list contains one or more elements or False otherwise. */ static inline bool -list_is_empty(struct list *head) +list_is_empty(const struct list *head) { return head->next == head; } diff --git a/contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c b/contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c index 8b73abee00..ccfad0cfe0 100644 --- a/contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c +++ b/contrib/sdk/sources/Intel-2D/sna/brw/brw_wm.c @@ -521,7 +521,7 @@ static void brw_wm_projective_st(struct brw_compile *p, int dw, if (p->gen >= 060) { /* First compute 1/z */ brw_PLN(p, - brw_message_reg(msg), + brw_vec8_grf(30, 0), brw_vec1_grf(uv+1, 0), brw_vec8_grf(2, 0)); @@ -532,22 +532,22 @@ static void brw_wm_projective_st(struct brw_compile *p, int dw, brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); } else brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); + brw_PLN(p, - brw_vec8_grf(28, 0), + brw_vec8_grf(26, 0), brw_vec1_grf(uv, 0), brw_vec8_grf(2, 0)); - brw_MUL(p, - brw_message_reg(msg), - brw_vec8_grf(28, 0), - brw_vec8_grf(30, 0)); - msg += dw/8; - brw_PLN(p, brw_vec8_grf(28, 0), brw_vec1_grf(uv, 0), brw_vec8_grf(4, 0)); + brw_MUL(p, brw_message_reg(msg), + brw_vec8_grf(26, 0), + brw_vec8_grf(30, 0)); + brw_MUL(p, + brw_message_reg(msg + dw/8), brw_vec8_grf(28, 0), brw_vec8_grf(30, 0)); } else { diff --git a/contrib/sdk/sources/Intel-2D/sna/gen3_render.c b/contrib/sdk/sources/Intel-2D/sna/gen3_render.c index bd44f09c23..fe9c160310 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen3_render.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen3_render.c @@ -1459,7 +1459,7 @@ static void gen3_vertex_close(struct sna *sna) sna->render.vertices = sna->render.vertex_data; sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); free_bo = bo; - } else if (IS_CPU_MAP(bo->map)) { + } else if (sna->render.vertices == MAP(bo->map__cpu)) { DBG(("%s: converting CPU map to GTT\n", __FUNCTION__)); sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, bo); if (sna->render.vertices == NULL) { @@ -1657,6 
+1657,122 @@ gen3_render_composite_blt(struct sna *sna, op->prim_emit(sna, op, r); } +#if 0 +fastcall static void +gen3_render_composite_box(struct sna *sna, + const struct sna_composite_op *op, + const BoxRec *box) +{ + struct sna_composite_rectangles r; + + DBG(("%s: src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n", + __FUNCTION__, + op->src.offset[0], op->src.offset[1], + op->mask.offset[0], op->mask.offset[1], + op->dst.x, op->dst.y)); + + gen3_get_rectangles(sna, op, 1); + + r.dst.x = box->x1; + r.dst.y = box->y1; + r.width = box->x2 - box->x1; + r.height = box->y2 - box->y1; + r.src = r.mask = r.dst; + + op->prim_emit(sna, op, &r); +} + +static void +gen3_render_composite_boxes__blt(struct sna *sna, + const struct sna_composite_op *op, + const BoxRec *box, int nbox) +{ + DBG(("%s: nbox=%d, src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n", + __FUNCTION__, nbox, + op->src.offset[0], op->src.offset[1], + op->mask.offset[0], op->mask.offset[1], + op->dst.x, op->dst.y)); + + do { + int nbox_this_time; + + nbox_this_time = gen3_get_rectangles(sna, op, nbox); + nbox -= nbox_this_time; + + do { + struct sna_composite_rectangles r; + + DBG((" %s: (%d, %d) x (%d, %d)\n", __FUNCTION__, + box->x1, box->y1, + box->x2 - box->x1, + box->y2 - box->y1)); + + r.dst.x = box->x1; r.dst.y = box->y1; + r.width = box->x2 - box->x1; + r.height = box->y2 - box->y1; + r.src = r.mask = r.dst; + + op->prim_emit(sna, op, &r); + box++; + } while (--nbox_this_time); + } while (nbox); +} + +static void +gen3_render_composite_boxes(struct sna *sna, + const struct sna_composite_op *op, + const BoxRec *box, int nbox) +{ + DBG(("%s: nbox=%d\n", __FUNCTION__, nbox)); + + do { + int nbox_this_time; + float *v; + + nbox_this_time = gen3_get_rectangles(sna, op, nbox); + assert(nbox_this_time); + nbox -= nbox_this_time; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += nbox_this_time * op->floats_per_rect; + + op->emit_boxes(op, box, nbox_this_time, v); + box += nbox_this_time; + } while (nbox); +} + +static void +gen3_render_composite_boxes__thread(struct sna *sna, + const struct sna_composite_op *op, + const BoxRec *box, int nbox) +{ + DBG(("%s: nbox=%d\n", __FUNCTION__, nbox)); + + sna_vertex_lock(&sna->render); + do { + int nbox_this_time; + float *v; + + nbox_this_time = gen3_get_rectangles(sna, op, nbox); + assert(nbox_this_time); + nbox -= nbox_this_time; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += nbox_this_time * op->floats_per_rect; + + sna_vertex_acquire__locked(&sna->render); + sna_vertex_unlock(&sna->render); + + op->emit_boxes(op, box, nbox_this_time, v); + box += nbox_this_time; + + sna_vertex_lock(&sna->render); + sna_vertex_release__locked(&sna->render); + } while (nbox); + sna_vertex_unlock(&sna->render); +} +#endif + static void gen3_render_composite_done(struct sna *sna, const struct sna_composite_op *op) @@ -1702,8 +1818,7 @@ gen3_render_reset(struct sna *sna) state->last_floats_per_vertex = 0; state->last_vertex_offset = 0; - if (sna->render.vbo != NULL && - !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { + if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { DBG(("%s: discarding vbo as next access will stall: %d\n", __FUNCTION__, sna->render.vbo->presumed_offset)); discard_vbo(sna); @@ -1755,6 +1870,285 @@ static bool gen3_composite_channel_set_format(struct sna_composite_channel *chan } #if 0 +static bool source_is_covered(PicturePtr picture, + int x, int y, + int width, int height) +{ + int x1, y1, x2, 
y2; + + if (picture->repeat && picture->repeatType != RepeatNone) + return true; + + if (picture->pDrawable == NULL) + return false; + + if (picture->transform) { + pixman_box16_t sample; + + sample.x1 = x; + sample.y1 = y; + sample.x2 = x + width; + sample.y2 = y + height; + + pixman_transform_bounds(picture->transform, &sample); + + x1 = sample.x1; + x2 = sample.x2; + y1 = sample.y1; + y2 = sample.y2; + } else { + x1 = x; + y1 = y; + x2 = x + width; + y2 = y + height; + } + + return + x1 >= 0 && y1 >= 0 && + x2 <= picture->pDrawable->width && + y2 <= picture->pDrawable->height; +} + +static bool gen3_composite_channel_set_xformat(PicturePtr picture, + struct sna_composite_channel *channel, + int x, int y, + int width, int height) +{ + unsigned int i; + + if (PICT_FORMAT_A(picture->format) != 0) + return false; + + if (width == 0 || height == 0) + return false; + + if (!source_is_covered(picture, x, y, width, height)) + return false; + + for (i = 0; i < ARRAY_SIZE(gen3_tex_formats); i++) { + if (gen3_tex_formats[i].xfmt == picture->format) { + channel->card_format = gen3_tex_formats[i].card_fmt; + channel->rb_reversed = gen3_tex_formats[i].rb_reversed; + channel->alpha_fixup = true; + return true; + } + } + + return false; +} + +static int +gen3_init_solid(struct sna_composite_channel *channel, uint32_t color) +{ + channel->u.gen3.mode = color; + channel->u.gen3.type = SHADER_CONSTANT; + if (color == 0) + channel->u.gen3.type = SHADER_ZERO; + else if (color == 0xff000000) + channel->u.gen3.type = SHADER_BLACK; + else if (color == 0xffffffff) + channel->u.gen3.type = SHADER_WHITE; + + channel->bo = NULL; + channel->is_opaque = (color >> 24) == 0xff; + channel->is_affine = 1; + channel->alpha_fixup = 0; + channel->rb_reversed = 0; + + DBG(("%s: color=%08x, is_opaque=%d, type=%d\n", + __FUNCTION__, color, channel->is_opaque, channel->u.gen3.type)); + + /* for consistency */ + channel->repeat = RepeatNormal; + channel->filter = PictFilterNearest; + channel->pict_format = PICT_a8r8g8b8; + channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888; + + return 1; +} + +static void gen3_composite_channel_convert(struct sna_composite_channel *channel) +{ + if (channel->u.gen3.type == SHADER_TEXTURE) + channel->repeat = gen3_texture_repeat(channel->repeat); + else + channel->repeat = gen3_gradient_repeat(channel->repeat); + + channel->filter = gen3_filter(channel->filter); + if (channel->card_format == 0) + gen3_composite_channel_set_format(channel, channel->pict_format); + assert(channel->card_format); +} + +static bool gen3_gradient_setup(struct sna *sna, + PicturePtr picture, + struct sna_composite_channel *channel, + int16_t ox, int16_t oy) +{ + int16_t dx, dy; + + if (picture->repeat == 0) { + channel->repeat = RepeatNone; + } else switch (picture->repeatType) { + case RepeatNone: + case RepeatNormal: + case RepeatPad: + case RepeatReflect: + channel->repeat = picture->repeatType; + break; + default: + return false; + } + + channel->bo = + sna_render_get_gradient(sna, + (PictGradient *)picture->pSourcePict); + if (channel->bo == NULL) + return false; + + channel->pict_format = PICT_a8r8g8b8; + channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888; + channel->filter = PictFilterNearest; + channel->is_affine = sna_transform_is_affine(picture->transform); + if (sna_transform_is_integer_translation(picture->transform, &dx, &dy)) { + DBG(("%s: integer translation (%d, %d), removing\n", + __FUNCTION__, dx, dy)); + ox += dx; + oy += dy; + channel->transform = NULL; + } else + channel->transform = 
picture->transform; + channel->width = channel->bo->pitch / 4; + channel->height = 1; + channel->offset[0] = ox; + channel->offset[1] = oy; + channel->scale[0] = channel->scale[1] = 1; + return true; +} + +static int +gen3_init_linear(struct sna *sna, + PicturePtr picture, + struct sna_composite_op *op, + struct sna_composite_channel *channel, + int ox, int oy) +{ + PictLinearGradient *linear = + (PictLinearGradient *)picture->pSourcePict; + float x0, y0, sf; + float dx, dy, offset; + int n; + + DBG(("%s: p1=(%f, %f), p2=(%f, %f)\n", + __FUNCTION__, + xFixedToDouble(linear->p1.x), xFixedToDouble(linear->p1.y), + xFixedToDouble(linear->p2.x), xFixedToDouble(linear->p2.y))); + + if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y) + return 0; + + dx = xFixedToDouble(linear->p2.x - linear->p1.x); + dy = xFixedToDouble(linear->p2.y - linear->p1.y); + sf = dx*dx + dy*dy; + dx /= sf; + dy /= sf; + + x0 = xFixedToDouble(linear->p1.x); + y0 = xFixedToDouble(linear->p1.y); + offset = dx*x0 + dy*y0; + + n = op->u.gen3.num_constants; + channel->u.gen3.constants = FS_C0 + n / 4; + op->u.gen3.constants[n++] = dx; + op->u.gen3.constants[n++] = dy; + op->u.gen3.constants[n++] = -offset; + op->u.gen3.constants[n++] = 0; + + if (!gen3_gradient_setup(sna, picture, channel, ox, oy)) + return -1; + + channel->u.gen3.type = SHADER_LINEAR; + op->u.gen3.num_constants = n; + + DBG(("%s: dx=%f, dy=%f, offset=%f, constants=%d\n", + __FUNCTION__, dx, dy, -offset, channel->u.gen3.constants - FS_C0)); + return 1; +} + +static int +gen3_init_radial(struct sna *sna, + PicturePtr picture, + struct sna_composite_op *op, + struct sna_composite_channel *channel, + int ox, int oy) +{ + PictRadialGradient *radial = (PictRadialGradient *)picture->pSourcePict; + double dx, dy, dr, r1; + int n; + + dx = xFixedToDouble(radial->c2.x - radial->c1.x); + dy = xFixedToDouble(radial->c2.y - radial->c1.y); + dr = xFixedToDouble(radial->c2.radius - radial->c1.radius); + + r1 = xFixedToDouble(radial->c1.radius); + + n = op->u.gen3.num_constants; + channel->u.gen3.constants = FS_C0 + n / 4; + if (radial->c2.x == radial->c1.x && radial->c2.y == radial->c1.y) { + if (radial->c2.radius == radial->c1.radius) { + channel->u.gen3.type = SHADER_ZERO; + return 1; + } + + op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.x) / dr; + op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.y) / dr; + op->u.gen3.constants[n++] = 1. 
/ dr; + op->u.gen3.constants[n++] = -r1 / dr; + + channel->u.gen3.mode = RADIAL_ONE; + } else { + op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.x); + op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.y); + op->u.gen3.constants[n++] = r1; + op->u.gen3.constants[n++] = -4 * (dx*dx + dy*dy - dr*dr); + + op->u.gen3.constants[n++] = -2 * dx; + op->u.gen3.constants[n++] = -2 * dy; + op->u.gen3.constants[n++] = -2 * r1 * dr; + op->u.gen3.constants[n++] = 1 / (2 * (dx*dx + dy*dy - dr*dr)); + + channel->u.gen3.mode = RADIAL_TWO; + } + + if (!gen3_gradient_setup(sna, picture, channel, ox, oy)) + return -1; + + channel->u.gen3.type = SHADER_RADIAL; + op->u.gen3.num_constants = n; + return 1; +} + +static bool +sna_picture_is_clear(PicturePtr picture, + int x, int y, int w, int h, + uint32_t *color) +{ + struct sna_pixmap *priv; + + if (!picture->pDrawable) + return false; + + priv = sna_pixmap(get_drawable_pixmap(picture->pDrawable)); + if (priv == NULL || !priv->clear) + return false; + + if (!source_is_covered(picture, x, y, w, h)) + return false; + + *color = priv->clear_color; + return true; +} + static int gen3_composite_picture(struct sna *sna, PicturePtr picture, @@ -1874,72 +2268,35 @@ gen3_composite_picture(struct sna *sna, return sna_render_pixmap_bo(sna, channel, pixmap, x, y, w, h, dst_x, dst_y); } - -static inline bool -source_use_blt(struct sna *sna, PicturePtr picture) -{ - /* If it is a solid, try to use the BLT paths */ - if (!picture->pDrawable) - return picture->pSourcePict->type == SourcePictTypeSolidFill; - - if (picture->pDrawable->width == 1 && - picture->pDrawable->height == 1 && - picture->repeat) - return true; - - if (too_large(picture->pDrawable->width, picture->pDrawable->height)) - return true; - - return !is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER); -} - -static bool -try_blt(struct sna *sna, - PicturePtr dst, - PicturePtr src, - int width, int height) -{ - if (sna->kgem.mode != KGEM_RENDER) { - DBG(("%s: already performing BLT\n", __FUNCTION__)); - return true; - } - - if (too_large(width, height)) { - DBG(("%s: operation too large for 3D pipe (%d, %d)\n", - __FUNCTION__, width, height)); - return true; - } - - if (too_large(dst->pDrawable->width, dst->pDrawable->height)) { - DBG(("%s: target too large for 3D pipe (%d, %d)\n", - __FUNCTION__, - dst->pDrawable->width, dst->pDrawable->height)); - return true; - } - - /* is the source picture only in cpu memory e.g. a shm pixmap? 
*/ - return source_use_blt(sna, src); -} #endif static void gen3_align_vertex(struct sna *sna, const struct sna_composite_op *op) { - if (op->floats_per_vertex != sna->render_state.gen3.last_floats_per_vertex) { - if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) - gen3_vertex_finish(sna); + int vertex_index; - DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", - sna->render_state.gen3.last_floats_per_vertex, - op->floats_per_vertex, - sna->render.vertex_index, - (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); - sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; - sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; - assert(sna->render.vertex_used < sna->render.vertex_size - op->floats_per_rect); - sna->render_state.gen3.last_floats_per_vertex = op->floats_per_vertex; + if (op->floats_per_vertex == sna->render_state.gen3.last_floats_per_vertex) + return; + + DBG(("aligning vertex: was %d, now %d floats per vertex\n", + sna->render_state.gen3.last_floats_per_vertex, + op->floats_per_vertex)); + + assert(op->floats_per_rect == 3*op->floats_per_vertex); + + vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; + if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) { + DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n", + __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex)); + if (gen3_vertex_finish(sna) < op->floats_per_vertex) + kgem_submit(&sna->kgem); + + vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; } + + sna->render.vertex_index = vertex_index; + sna->render.vertex_used = vertex_index * op->floats_per_vertex; } static inline bool is_constant_ps(uint32_t type) @@ -2059,6 +2416,58 @@ gen3_composite_fallback(struct sna *sna, return dst_use_cpu(dst_pixmap); } +static int +reuse_source(struct sna *sna, + PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y, + PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y) +{ + if (src_x != msk_x || src_y != msk_y) + return false; + + if (mask == src) { + *mc = *sc; + if (mc->bo) + kgem_bo_reference(mc->bo); + return true; + } + + if ((src->pDrawable == NULL || mask->pDrawable != src->pDrawable)) + return false; + + if (sc->is_solid) + return false; + + DBG(("%s: mask reuses source drawable\n", __FUNCTION__)); + + if (!sna_transform_equal(src->transform, mask->transform)) + return false; + + if (!sna_picture_alphamap_equal(src, mask)) + return false; + + if (!gen3_check_repeat(mask)) + return false; + + if (!gen3_check_filter(mask)) + return false; + + if (!gen3_check_format(mask)) + return false; + + DBG(("%s: reusing source channel for mask with a twist\n", + __FUNCTION__)); + + *mc = *sc; + mc->repeat = gen3_texture_repeat(mask->repeat ? mask->repeatType : RepeatNone); + mc->filter = gen3_filter(mask->filter); + mc->pict_format = mask->format; + gen3_composite_channel_set_format(mc, mask->format); + assert(mc->card_format); + if (mc->bo) + kgem_bo_reference(mc->bo); + return true; +} + static bool gen3_render_composite(struct sna *sna, uint8_t op, @@ -2083,7 +2492,6 @@ gen3_render_composite(struct sna *sna, * 3D -> 2D context switch. 
*/ if (mask == NULL && - try_blt(sna, dst, src, width, height) && sna_blt_composite(sna, op, src, dst, src_x, src_y, @@ -2093,7 +2501,7 @@ gen3_render_composite(struct sna *sna, return true; if (gen3_composite_fallback(sna, op, src, mask, dst)) - return false; + goto fallback; if (need_tiling(sna, width, height)) return sna_tiling_composite(op, src, mask, dst, @@ -2117,7 +2525,7 @@ gen3_render_composite(struct sna *sna, if (!sna_render_composite_redirect(sna, tmp, dst_x, dst_y, width, height, op > PictOpSrc || dst->pCompositeClip->data)) - return false; + goto fallback; } tmp->u.gen3.num_constants = 0; @@ -2406,8 +2814,8 @@ gen3_render_composite(struct sna *sna, goto cleanup_mask; } - gen3_emit_composite_state(sna, tmp); gen3_align_vertex(sna, tmp); + gen3_emit_composite_state(sna, tmp); return true; cleanup_mask: @@ -2419,8 +2827,1719 @@ cleanup_src: cleanup_dst: if (tmp->redirect.real_bo) kgem_bo_destroy(&sna->kgem, tmp->dst.bo); +fallback: + return (mask == NULL && + sna_blt_composite(sna, + op, src, dst, + src_x, src_y, + dst_x, dst_y, + width, height, + tmp, true)); +} + +static void +gen3_emit_composite_spans_vertex(struct sna *sna, + const struct sna_composite_spans_op *op, + int16_t x, int16_t y, + float opacity) +{ + gen3_emit_composite_dstcoord(sna, x + op->base.dst.x, y + op->base.dst.y); + gen3_emit_composite_texcoord(sna, &op->base.src, x, y); + OUT_VERTEX(opacity); +} + +fastcall static void +gen3_emit_composite_spans_primitive_zero(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + + v[2] = op->base.dst.x + box->x1; + v[3] = v[1]; + + v[4] = v[2]; + v[5] = op->base.dst.x + box->y1; +} + +fastcall static void +gen3_emit_composite_spans_primitive_zero__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + + v[2] = op->base.dst.x + b->box.x1; + v[3] = v[1]; + + v[4] = v[2]; + v[5] = op->base.dst.x + b->box.y1; + + v += 6; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + + v[0] = box->x2; + v[3] = v[1] = box->y2; + v[4] = v[2] = box->x1; + v[5] = box->y1; +} + +fastcall static void +gen3_emit_composite_spans_primitive_zero_no_offset__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, float *v) +{ + do { + v[0] = b->box.x2; + v[3] = v[1] = b->box.y2; + v[4] = v[2] = b->box.x1; + v[5] = b->box.y1; + + b++; + v += 6; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_constant(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = op->base.dst.x + box->x2; + v[6] = v[3] = op->base.dst.x + box->x1; + v[4] = v[1] = op->base.dst.y + box->y2; + v[7] = op->base.dst.y + box->y1; + v[8] = v[5] = v[2] = opacity; +} + +fastcall static void +gen3_emit_composite_spans_primitive_constant__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int 
nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[6] = v[3] = op->base.dst.x + b->box.x1; + v[4] = v[1] = op->base.dst.y + b->box.y2; + v[7] = op->base.dst.y + b->box.y1; + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = box->x2; + v[6] = v[3] = box->x1; + v[4] = v[1] = box->y2; + v[7] = box->y1; + v[8] = v[5] = v[2] = opacity; +} + +fastcall static void +gen3_emit_composite_spans_primitive_constant_no_offset__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, float *v) +{ + do { + v[0] = b->box.x2; + v[6] = v[3] = b->box.x1; + v[4] = v[1] = b->box.y2; + v[7] = b->box.y1; + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_identity_source(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0]; + v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1]; + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0]; + v[8] = v[3]; + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + v[12] = v[7]; + v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1]; + v[14] = opacity; +} + +fastcall static void +gen3_emit_composite_spans_primitive_identity_source__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0]; + v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1]; + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0]; + v[8] = v[3]; + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + v[12] = v[7]; + v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1]; + v[14] = b->alpha; + + v += 15; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_affine_source(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + PictTransform *transform = op->base.src.transform; + float *v; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[6] = v[1] = op->base.dst.y + box->y2; + v[10] = v[5] = op->base.dst.x + box->x1; + v[11] = op->base.dst.y + box->y1; + v[14] = v[9] = v[4] = opacity; + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2, + (int)op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[2], &v[3]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, + (int)op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[7], &v[8]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, + 
(int)op->base.src.offset[1] + box->y1, + transform, op->base.src.scale, + &v[12], &v[13]); +} + +fastcall static void +gen3_emit_composite_spans_primitive_affine_source__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + PictTransform *transform = op->base.src.transform; + + do { + v[0] = op->base.dst.x + b->box.x2; + v[6] = v[1] = op->base.dst.y + b->box.y2; + v[10] = v[5] = op->base.dst.x + b->box.x1; + v[11] = op->base.dst.y + b->box.y1; + v[14] = v[9] = v[4] = b->alpha; + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2, + (int)op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[2], &v[3]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, + (int)op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[7], &v[8]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, + (int)op->base.src.offset[1] + b->box.y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v += 15; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + v[2] = op->base.src.offset[0] + box->x2; + v[3] = op->base.src.offset[1] + box->y2; + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + v[7] = op->base.src.offset[0] + box->x1; + v[8] = v[3]; + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + v[12] = v[7]; + v[13] = op->base.src.offset[1] + box->y1; + v[14] = opacity; +} + +fastcall static void +gen3_emit_composite_spans_primitive_identity_gradient__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + v[2] = op->base.src.offset[0] + b->box.x2; + v[3] = op->base.src.offset[1] + b->box.y2; + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + v[7] = op->base.src.offset[0] + b->box.x1; + v[8] = v[3]; + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + v[12] = v[7]; + v[13] = op->base.src.offset[1] + b->box.y1; + v[14] = b->alpha; + + v += 15; + b++; + } while (--nbox); +} + +#if defined(sse2) && !defined(__x86_64__) +sse2 fastcall static void +gen3_emit_composite_spans_primitive_constant__sse2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = op->base.dst.x + box->x2; + v[6] = v[3] = op->base.dst.x + box->x1; + v[4] = v[1] = op->base.dst.y + box->y2; + v[7] = op->base.dst.y + box->y1; + v[8] = v[5] = v[2] = opacity; +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_constant__sse2__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[6] = v[3] = op->base.dst.x + b->box.x1; + v[4] = v[1] = op->base.dst.y + b->box.y2; + v[7] = op->base.dst.y + b->box.y1; + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +sse2 fastcall static void +gen3_render_composite_spans_constant_box__sse2(struct sna *sna, + const struct 
sna_composite_spans_op *op, + const BoxRec *box, float opacity) +{ + float *v; + DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n", + __FUNCTION__, + op->base.src.offset[0], op->base.src.offset[1], + opacity, + op->base.dst.x, op->base.dst.y, + box->x1, box->y1, + box->x2 - box->x1, + box->y2 - box->y1)); + + gen3_get_rectangles(sna, &op->base, 1); + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = box->x2; + v[6] = v[3] = box->x1; + v[4] = v[1] = box->y2; + v[7] = box->y1; + v[8] = v[5] = v[2] = opacity; +} + +sse2 fastcall static void +gen3_render_composite_spans_constant_thread__sse2__boxes(struct sna *sna, + const struct sna_composite_spans_op *op, + const struct sna_opacity_box *box, + int nbox) +{ + DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n", + __FUNCTION__, nbox, + op->base.src.offset[0], op->base.src.offset[1], + op->base.dst.x, op->base.dst.y)); + + sna_vertex_lock(&sna->render); + do { + int nbox_this_time; + float *v; + + nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); + assert(nbox_this_time); + nbox -= nbox_this_time; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += nbox_this_time * 9; + + sna_vertex_acquire__locked(&sna->render); + sna_vertex_unlock(&sna->render); + + do { + v[0] = box->box.x2; + v[6] = v[3] = box->box.x1; + v[4] = v[1] = box->box.y2; + v[7] = box->box.y1; + v[8] = v[5] = v[2] = box->alpha; + v += 9; + box++; + } while (--nbox_this_time); + + sna_vertex_lock(&sna->render); + sna_vertex_release__locked(&sna->render); + } while (nbox); + sna_vertex_unlock(&sna->render); +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_constant__sse2__no_offset(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = box->x2; + v[6] = v[3] = box->x1; + v[4] = v[1] = box->y2; + v[7] = box->y1; + v[8] = v[5] = v[2] = opacity; +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, float *v) +{ + do { + v[0] = b->box.x2; + v[6] = v[3] = b->box.x1; + v[4] = v[1] = b->box.y2; + v[7] = b->box.y1; + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0]; + v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1]; + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0]; + v[8] = v[3]; + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + v[12] = v[7]; + v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1]; + v[14] = opacity; +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_identity_source__sse2__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + 
v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0]; + v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1]; + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0]; + v[8] = v[3]; + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + v[12] = v[7]; + v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1]; + v[14] = b->alpha; + + v += 15; + b++; + } while (--nbox); +} +sse2 fastcall static void +gen3_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + PictTransform *transform = op->base.src.transform; + float *v; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[6] = v[1] = op->base.dst.y + box->y2; + v[10] = v[5] = op->base.dst.x + box->x1; + v[11] = op->base.dst.y + box->y1; + v[14] = v[9] = v[4] = opacity; + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2, + (int)op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[2], &v[3]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, + (int)op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[7], &v[8]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1, + (int)op->base.src.offset[1] + box->y1, + transform, op->base.src.scale, + &v[12], &v[13]); +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_affine_source__sse2__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + PictTransform *transform = op->base.src.transform; + + do { + v[0] = op->base.dst.x + b->box.x2; + v[6] = v[1] = op->base.dst.y + b->box.y2; + v[10] = v[5] = op->base.dst.x + b->box.x1; + v[11] = op->base.dst.y + b->box.y1; + v[14] = v[9] = v[4] = b->alpha; + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2, + (int)op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[2], &v[3]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, + (int)op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[7], &v[8]); + + _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1, + (int)op->base.src.offset[1] + b->box.y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v += 15; + b++; + } while (--nbox); +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_identity_gradient__sse2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + v[2] = op->base.src.offset[0] + box->x2; + v[3] = op->base.src.offset[1] + box->y2; + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + v[7] = op->base.src.offset[0] + box->x1; + v[8] = v[3]; + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + v[12] = v[7]; + v[13] = op->base.src.offset[1] + box->y1; + v[14] = opacity; +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + 
v[2] = op->base.src.offset[0] + b->box.x2; + v[3] = op->base.src.offset[1] + b->box.y2; + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + v[7] = op->base.src.offset[0] + b->box.x1; + v[8] = v[3]; + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + v[12] = v[7]; + v[13] = op->base.src.offset[1] + b->box.y1; + v[14] = b->alpha; + + v += 15; + b++; + } while (--nbox); +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_affine_gradient__sse2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + PictTransform *transform = op->base.src.transform; + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[2], &v[3]); + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[7], &v[8]); + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v[14] = opacity; +} + +sse2 fastcall static void +gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + PictTransform *transform = op->base.src.transform; + + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[2], &v[3]); + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[7], &v[8]); + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v[14] = b->alpha; + v += 15; + b++; + } while (--nbox); +} +#endif + +fastcall static void +gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + PictTransform *transform = op->base.src.transform; + float *v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 15; + + v[0] = op->base.dst.x + box->x2; + v[1] = op->base.dst.y + box->y2; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[2], &v[3]); + v[4] = opacity; + + v[5] = op->base.dst.x + box->x1; + v[6] = v[1]; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + transform, op->base.src.scale, + &v[7], &v[8]); + v[9] = opacity; + + v[10] = v[5]; + v[11] = op->base.dst.y + box->y1; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v[14] = opacity; +} + +fastcall static void 
+gen3_emit_composite_spans_primitive_affine_gradient__boxes(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, + int nbox, + float *v) +{ + PictTransform *transform = op->base.src.transform; + + do { + v[0] = op->base.dst.x + b->box.x2; + v[1] = op->base.dst.y + b->box.y2; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[2], &v[3]); + v[4] = b->alpha; + + v[5] = op->base.dst.x + b->box.x1; + v[6] = v[1]; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + transform, op->base.src.scale, + &v[7], &v[8]); + v[9] = b->alpha; + + v[10] = v[5]; + v[11] = op->base.dst.y + b->box.y1; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + transform, op->base.src.scale, + &v[12], &v[13]); + v[14] = b->alpha; + v += 15; + b++; + } while (--nbox); +} + +fastcall static void +gen3_emit_composite_spans_primitive(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + gen3_emit_composite_spans_vertex(sna, op, + box->x2, box->y2, + opacity); + gen3_emit_composite_spans_vertex(sna, op, + box->x1, box->y2, + opacity); + gen3_emit_composite_spans_vertex(sna, op, + box->x1, box->y1, + opacity); +} + +fastcall static void +gen3_render_composite_spans_constant_box(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, float opacity) +{ + float *v; + DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n", + __FUNCTION__, + op->base.src.offset[0], op->base.src.offset[1], + opacity, + op->base.dst.x, op->base.dst.y, + box->x1, box->y1, + box->x2 - box->x1, + box->y2 - box->y1)); + + gen3_get_rectangles(sna, &op->base, 1); + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + v[0] = box->x2; + v[6] = v[3] = box->x1; + v[4] = v[1] = box->y2; + v[7] = box->y1; + v[8] = v[5] = v[2] = opacity; +} + +fastcall static void +gen3_render_composite_spans_constant_thread_boxes(struct sna *sna, + const struct sna_composite_spans_op *op, + const struct sna_opacity_box *box, + int nbox) +{ + DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n", + __FUNCTION__, nbox, + op->base.src.offset[0], op->base.src.offset[1], + op->base.dst.x, op->base.dst.y)); + + sna_vertex_lock(&sna->render); + do { + int nbox_this_time; + float *v; + + nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); + assert(nbox_this_time); + nbox -= nbox_this_time; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += nbox_this_time * 9; + + sna_vertex_acquire__locked(&sna->render); + sna_vertex_unlock(&sna->render); + + do { + v[0] = box->box.x2; + v[6] = v[3] = box->box.x1; + v[4] = v[1] = box->box.y2; + v[7] = box->box.y1; + v[8] = v[5] = v[2] = box->alpha; + v += 9; + box++; + } while (--nbox_this_time); + + sna_vertex_lock(&sna->render); + sna_vertex_release__locked(&sna->render); + } while (nbox); + sna_vertex_unlock(&sna->render); +} + +fastcall static void +gen3_render_composite_spans_box(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, float opacity) +{ + DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n", + __FUNCTION__, + op->base.src.offset[0], op->base.src.offset[1], + opacity, + op->base.dst.x, op->base.dst.y, + box->x1, box->y1, + box->x2 - box->x1, + box->y2 - box->y1)); + + gen3_get_rectangles(sna, 
&op->base, 1); + op->prim_emit(sna, op, box, opacity); +} + +static void +gen3_render_composite_spans_boxes(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, int nbox, + float opacity) +{ + DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n", + __FUNCTION__, nbox, + op->base.src.offset[0], op->base.src.offset[1], + opacity, + op->base.dst.x, op->base.dst.y)); + + do { + int nbox_this_time; + + nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); + nbox -= nbox_this_time; + + do { + DBG((" %s: (%d, %d) x (%d, %d)\n", __FUNCTION__, + box->x1, box->y1, + box->x2 - box->x1, + box->y2 - box->y1)); + + op->prim_emit(sna, op, box++, opacity); + } while (--nbox_this_time); + } while (nbox); +} + +fastcall static void +gen3_render_composite_spans_boxes__thread(struct sna *sna, + const struct sna_composite_spans_op *op, + const struct sna_opacity_box *box, + int nbox) +{ + DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n", + __FUNCTION__, nbox, + op->base.src.offset[0], op->base.src.offset[1], + op->base.dst.x, op->base.dst.y)); + + sna_vertex_lock(&sna->render); + do { + int nbox_this_time; + float *v; + + nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox); + assert(nbox_this_time); + nbox -= nbox_this_time; + + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect; + + sna_vertex_acquire__locked(&sna->render); + sna_vertex_unlock(&sna->render); + + op->emit_boxes(op, box, nbox_this_time, v); + box += nbox_this_time; + + sna_vertex_lock(&sna->render); + sna_vertex_release__locked(&sna->render); + } while (nbox); + sna_vertex_unlock(&sna->render); +} + +fastcall static void +gen3_render_composite_spans_done(struct sna *sna, + const struct sna_composite_spans_op *op) +{ + if (sna->render.vertex_offset) + gen3_vertex_flush(sna); + + DBG(("%s()\n", __FUNCTION__)); + + if (op->base.src.bo) + kgem_bo_destroy(&sna->kgem, op->base.src.bo); + + sna_render_composite_redirect_done(sna, &op->base); +} + +static bool +gen3_check_composite_spans(struct sna *sna, + uint8_t op, PicturePtr src, PicturePtr dst, + int16_t width, int16_t height, unsigned flags) +{ + if (op >= ARRAY_SIZE(gen3_blend_op)) + return false; + + if (gen3_composite_fallback(sna, op, src, NULL, dst)) + return false; + + if (need_tiling(sna, width, height) && + !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) { + DBG(("%s: fallback, tiled operation not on GPU\n", + __FUNCTION__)); + return false; + } + + return true; +} + +static bool +gen3_render_composite_spans(struct sna *sna, + uint8_t op, + PicturePtr src, + PicturePtr dst, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + int16_t width, int16_t height, + unsigned flags, + struct sna_composite_spans_op *tmp) +{ + bool no_offset; + + DBG(("%s(src=(%d, %d), dst=(%d, %d), size=(%d, %d))\n", __FUNCTION__, + src_x, src_y, dst_x, dst_y, width, height)); + + assert(gen3_check_composite_spans(sna, op, src, dst, width, height, flags)); + + if (need_tiling(sna, width, height)) { + DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n", + __FUNCTION__, width, height)); + return sna_tiling_composite_spans(op, src, dst, + src_x, src_y, dst_x, dst_y, + width, height, flags, tmp); + } + + if (!gen3_composite_set_target(sna, &tmp->base, dst, + dst_x, dst_y, width, height)) { + DBG(("%s: unable to set render target\n", + __FUNCTION__)); + return false; + } + + tmp->base.op = op; + tmp->base.rb_reversed = gen3_dst_rb_reversed(tmp->base.dst.format); + if 
(too_large(tmp->base.dst.width, tmp->base.dst.height) || + !gen3_check_pitch_3d(tmp->base.dst.bo)) { + if (!sna_render_composite_redirect(sna, &tmp->base, + dst_x, dst_y, width, height, + true)) + return false; + } + + tmp->base.src.u.gen3.type = SHADER_TEXTURE; + tmp->base.src.is_affine = true; + DBG(("%s: preparing source\n", __FUNCTION__)); + switch (gen3_composite_picture(sna, src, &tmp->base, &tmp->base.src, + src_x, src_y, + width, height, + dst_x, dst_y, + dst->polyMode == PolyModePrecise)) { + case -1: + goto cleanup_dst; + case 0: + tmp->base.src.u.gen3.type = SHADER_ZERO; + break; + case 1: + gen3_composite_channel_convert(&tmp->base.src); + break; + } + DBG(("%s: source type=%d\n", __FUNCTION__, tmp->base.src.u.gen3.type)); + + if (tmp->base.src.u.gen3.type != SHADER_ZERO) + tmp->base.mask.u.gen3.type = SHADER_OPACITY; + + no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0; + tmp->box = gen3_render_composite_spans_box; + tmp->boxes = gen3_render_composite_spans_boxes; + tmp->thread_boxes = gen3_render_composite_spans_boxes__thread; + tmp->done = gen3_render_composite_spans_done; + tmp->prim_emit = gen3_emit_composite_spans_primitive; + switch (tmp->base.src.u.gen3.type) { + case SHADER_NONE: + assert(0); + case SHADER_ZERO: + if (no_offset) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_zero_no_offset; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero_no_offset__boxes; + } else { + tmp->prim_emit = gen3_emit_composite_spans_primitive_zero; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero__boxes; + } + break; + case SHADER_BLACK: + case SHADER_WHITE: + case SHADER_CONSTANT: + if (no_offset) { +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->box = gen3_render_composite_spans_constant_box__sse2; + tmp->thread_boxes = gen3_render_composite_spans_constant_thread__sse2__boxes; + tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2__no_offset; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes; + } else +#endif + { + tmp->box = gen3_render_composite_spans_constant_box; + tmp->thread_boxes = gen3_render_composite_spans_constant_thread_boxes; + tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant_no_offset__boxes; + } + } else { +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__boxes; + } else +#endif + { + tmp->prim_emit = gen3_emit_composite_spans_primitive_constant; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__boxes; + } + } + break; + case SHADER_LINEAR: + case SHADER_RADIAL: + if (tmp->base.src.transform == NULL) { +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient__sse2; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes; + } else +#endif + { + tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__boxes; + } + } else if (tmp->base.src.is_affine) { + tmp->base.src.scale[1] = tmp->base.src.scale[0] = 1. 
/ tmp->base.src.transform->matrix[2][2]; +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient__sse2; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes; + } else +#endif + { + tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__boxes; + } + } + break; + case SHADER_TEXTURE: + if (tmp->base.src.transform == NULL) { +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source__sse2; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__sse2__boxes; + } else +#endif + { + tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__boxes; + } + } else if (tmp->base.src.is_affine) { + tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2]; + tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2]; +#if defined(sse2) && !defined(__x86_64__) + if (sna->cpu_features & SSE2) { + tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source__sse2; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__sse2__boxes; + } else +#endif + { + tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source; + tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__boxes; + } + } + break; + } + if (tmp->emit_boxes == NULL) + tmp->thread_boxes = NULL; + + tmp->base.mask.bo = NULL; + + tmp->base.floats_per_vertex = 2; + if (!is_constant_ps(tmp->base.src.u.gen3.type)) + tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3; + tmp->base.floats_per_vertex += + tmp->base.mask.u.gen3.type == SHADER_OPACITY; + tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex; + + if (!kgem_check_bo(&sna->kgem, + tmp->base.dst.bo, tmp->base.src.bo, + NULL)) { + kgem_submit(&sna->kgem); + if (!kgem_check_bo(&sna->kgem, + tmp->base.dst.bo, tmp->base.src.bo, + NULL)) + goto cleanup_src; + } + + gen3_align_vertex(sna, &tmp->base); + gen3_emit_composite_state(sna, &tmp->base); + return true; + +cleanup_src: + if (tmp->base.src.bo) + kgem_bo_destroy(&sna->kgem, tmp->base.src.bo); +cleanup_dst: + if (tmp->base.redirect.real_bo) + kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo); return false; } + +static void +gen3_emit_video_state(struct sna *sna, + struct sna_video *video, + struct sna_video_frame *frame, + PixmapPtr pixmap, + struct kgem_bo *dst_bo, + int width, int height, + bool bilinear) +{ + struct gen3_render_state *state = &sna->render_state.gen3; + uint32_t id, ms3, rewind; + + gen3_emit_target(sna, dst_bo, width, height, + sna_format_for_depth(pixmap->drawable.depth)); + + /* XXX share with composite? Is it worth the effort? 
*/ + if ((state->last_shader & (1<<31)) == 0) { + OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | + I1_LOAD_S(1) | I1_LOAD_S(2) | I1_LOAD_S(6) | + 2); + OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) | (4 << S1_VERTEX_PITCH_SHIFT)); + OUT_BATCH(S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D) | + S2_TEXCOORD_FMT(1, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(2, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(3, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(4, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(5, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(6, TEXCOORDFMT_NOT_PRESENT) | + S2_TEXCOORD_FMT(7, TEXCOORDFMT_NOT_PRESENT)); + OUT_BATCH((2 << S6_CBUF_SRC_BLEND_FACT_SHIFT) | + (1 << S6_CBUF_DST_BLEND_FACT_SHIFT) | + S6_COLOR_WRITE_ENABLE); + + state->last_blend = 0; + state->floats_per_vertex = 4; + } + + if (!is_planar_fourcc(frame->id)) { + rewind = sna->kgem.nbatch; + OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | 4); + OUT_BATCH(0x0000001); /* constant 0 */ + /* constant 0: brightness/contrast */ + OUT_BATCH_F(video->brightness / 128.0); + OUT_BATCH_F(video->contrast / 255.0); + OUT_BATCH_F(0.0); + OUT_BATCH_F(0.0); + if (state->last_constants && + memcmp(&sna->kgem.batch[state->last_constants], + &sna->kgem.batch[rewind], + 6*sizeof(uint32_t)) == 0) + sna->kgem.nbatch = rewind; + else + state->last_constants = rewind; + + rewind = sna->kgem.nbatch; + OUT_BATCH(_3DSTATE_SAMPLER_STATE | 3); + OUT_BATCH(0x00000001); + OUT_BATCH(SS2_COLORSPACE_CONVERSION | + (FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) | + (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT)); + OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) | + (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) | + (0 << SS3_TEXTUREMAP_INDEX_SHIFT) | + SS3_NORMALIZED_COORDS); + OUT_BATCH(0x00000000); + if (state->last_sampler && + memcmp(&sna->kgem.batch[state->last_sampler], + &sna->kgem.batch[rewind], + 5*sizeof(uint32_t)) == 0) + sna->kgem.nbatch = rewind; + else + state->last_sampler = rewind; + + OUT_BATCH(_3DSTATE_MAP_STATE | 3); + OUT_BATCH(0x00000001); /* texture map #1 */ + OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch, + frame->bo, + I915_GEM_DOMAIN_SAMPLER << 16, + 0)); + + ms3 = MAPSURF_422; + switch (frame->id) { + case FOURCC_YUY2: + ms3 |= MT_422_YCRCB_NORMAL; + break; + case FOURCC_UYVY: + ms3 |= MT_422_YCRCB_SWAPY; + break; + } + ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT; + ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT; + OUT_BATCH(ms3); + OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT); + + id = 1<<31 | 1<<1 | !!video->brightness; + if (state->last_shader != id) { + state->last_shader = id; + id = sna->kgem.nbatch++; + + gen3_fs_dcl(FS_S0); + gen3_fs_dcl(FS_T0); + gen3_fs_texld(FS_OC, FS_S0, FS_T0); + if (video->brightness != 0) { + gen3_fs_add(FS_OC, + gen3_fs_operand_reg(FS_OC), + gen3_fs_operand(FS_C0, X, X, X, ZERO)); + } + + sna->kgem.batch[id] = + _3DSTATE_PIXEL_SHADER_PROGRAM | + (sna->kgem.nbatch - id - 2); + } + } else { + /* For the planar formats, we set up three samplers -- + * one for each plane, in a Y8 format. 
Because I + * couldn't get the special PLANAR_TO_PACKED + * shader setup to work, I did the manual pixel shader: + * + * y' = y - .0625 + * u' = u - .5 + * v' = v - .5; + * + * r = 1.1643 * y' + 0.0 * u' + 1.5958 * v' + * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v' + * b = 1.1643 * y' + 2.017 * u' + 0.0 * v' + * + * register assignment: + * r0 = (y',u',v',0) + * r1 = (y,y,y,y) + * r2 = (u,u,u,u) + * r3 = (v,v,v,v) + * OC = (r,g,b,1) + */ + rewind = sna->kgem.nbatch; + OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | (22 - 2)); + OUT_BATCH(0x000001f); /* constants 0-4 */ + /* constant 0: normalization offsets */ + OUT_BATCH_F(-0.0625); + OUT_BATCH_F(-0.5); + OUT_BATCH_F(-0.5); + OUT_BATCH_F(0.0); + /* constant 1: r coefficients */ + OUT_BATCH_F(1.1643); + OUT_BATCH_F(0.0); + OUT_BATCH_F(1.5958); + OUT_BATCH_F(0.0); + /* constant 2: g coefficients */ + OUT_BATCH_F(1.1643); + OUT_BATCH_F(-0.39173); + OUT_BATCH_F(-0.81290); + OUT_BATCH_F(0.0); + /* constant 3: b coefficients */ + OUT_BATCH_F(1.1643); + OUT_BATCH_F(2.017); + OUT_BATCH_F(0.0); + OUT_BATCH_F(0.0); + /* constant 4: brightness/contrast */ + OUT_BATCH_F(video->brightness / 128.0); + OUT_BATCH_F(video->contrast / 255.0); + OUT_BATCH_F(0.0); + OUT_BATCH_F(0.0); + if (state->last_constants && + memcmp(&sna->kgem.batch[state->last_constants], + &sna->kgem.batch[rewind], + 22*sizeof(uint32_t)) == 0) + sna->kgem.nbatch = rewind; + else + state->last_constants = rewind; + + rewind = sna->kgem.nbatch; + OUT_BATCH(_3DSTATE_SAMPLER_STATE | 9); + OUT_BATCH(0x00000007); + /* sampler 0 */ + OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) | + (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT)); + OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) | + (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) | + (0 << SS3_TEXTUREMAP_INDEX_SHIFT) | + SS3_NORMALIZED_COORDS); + OUT_BATCH(0x00000000); + /* sampler 1 */ + OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) | + (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT)); + OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) | + (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) | + (1 << SS3_TEXTUREMAP_INDEX_SHIFT) | + SS3_NORMALIZED_COORDS); + OUT_BATCH(0x00000000); + /* sampler 2 */ + OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) | + (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT)); + OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) | + (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) | + (2 << SS3_TEXTUREMAP_INDEX_SHIFT) | + SS3_NORMALIZED_COORDS); + OUT_BATCH(0x00000000); + if (state->last_sampler && + memcmp(&sna->kgem.batch[state->last_sampler], + &sna->kgem.batch[rewind], + 11*sizeof(uint32_t)) == 0) + sna->kgem.nbatch = rewind; + else + state->last_sampler = rewind; + + OUT_BATCH(_3DSTATE_MAP_STATE | 9); + OUT_BATCH(0x00000007); + + OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch, + frame->bo, + I915_GEM_DOMAIN_SAMPLER << 16, + 0)); + + ms3 = MAPSURF_8BIT | MT_8BIT_I8; + ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT; + ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT; + OUT_BATCH(ms3); + /* check to see if Y has special pitch than normal + * double u/v pitch, e.g i915 XvMC hw requires at + * least 1K alignment, so Y pitch might + * be same as U/V's.*/ + if (frame->pitch[1]) + OUT_BATCH(((frame->pitch[1] / 4) - 1) << MS4_PITCH_SHIFT); + else + OUT_BATCH(((frame->pitch[0] * 2 / 4) - 1) << MS4_PITCH_SHIFT); + + OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch, + frame->bo, + I915_GEM_DOMAIN_SAMPLER << 16, + frame->UBufOffset)); + + ms3 = MAPSURF_8BIT | MT_8BIT_I8; + ms3 |= 
(frame->height / 2 - 1) << MS3_HEIGHT_SHIFT; + ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT; + OUT_BATCH(ms3); + OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT); + + OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch, + frame->bo, + I915_GEM_DOMAIN_SAMPLER << 16, + frame->VBufOffset)); + + ms3 = MAPSURF_8BIT | MT_8BIT_I8; + ms3 |= (frame->height / 2 - 1) << MS3_HEIGHT_SHIFT; + ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT; + OUT_BATCH(ms3); + OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT); + + id = 1<<31 | 2<<1 | !!video->brightness; + if (state->last_shader != id) { + state->last_shader = id; + id = sna->kgem.nbatch++; + + /* Declare samplers */ + gen3_fs_dcl(FS_S0); /* Y */ + gen3_fs_dcl(FS_S1); /* U */ + gen3_fs_dcl(FS_S2); /* V */ + gen3_fs_dcl(FS_T0); /* normalized coords */ + + /* Load samplers to temporaries. */ + gen3_fs_texld(FS_R1, FS_S0, FS_T0); + gen3_fs_texld(FS_R2, FS_S1, FS_T0); + gen3_fs_texld(FS_R3, FS_S2, FS_T0); + + /* Move the sampled YUV data in R[123] to the first + * 3 channels of R0. + */ + gen3_fs_mov_masked(FS_R0, MASK_X, + gen3_fs_operand_reg(FS_R1)); + gen3_fs_mov_masked(FS_R0, MASK_Y, + gen3_fs_operand_reg(FS_R2)); + gen3_fs_mov_masked(FS_R0, MASK_Z, + gen3_fs_operand_reg(FS_R3)); + + /* Normalize the YUV data */ + gen3_fs_add(FS_R0, gen3_fs_operand_reg(FS_R0), + gen3_fs_operand_reg(FS_C0)); + /* dot-product the YUV data in R0 by the vectors of + * coefficients for calculating R, G, and B, storing + * the results in the R, G, or B channels of the output + * color. The OC results are implicitly clamped + * at the end of the program. + */ + gen3_fs_dp3(FS_OC, MASK_X, + gen3_fs_operand_reg(FS_R0), + gen3_fs_operand_reg(FS_C1)); + gen3_fs_dp3(FS_OC, MASK_Y, + gen3_fs_operand_reg(FS_R0), + gen3_fs_operand_reg(FS_C2)); + gen3_fs_dp3(FS_OC, MASK_Z, + gen3_fs_operand_reg(FS_R0), + gen3_fs_operand_reg(FS_C3)); + /* Set alpha of the output to 1.0, by wiring W to 1 + * and not actually using the source. 
+ */ + gen3_fs_mov_masked(FS_OC, MASK_W, + gen3_fs_operand_one()); + + if (video->brightness != 0) { + gen3_fs_add(FS_OC, + gen3_fs_operand_reg(FS_OC), + gen3_fs_operand(FS_C4, X, X, X, ZERO)); + } + + sna->kgem.batch[id] = + _3DSTATE_PIXEL_SHADER_PROGRAM | + (sna->kgem.nbatch - id - 2); + } + } +} + +static void +gen3_video_get_batch(struct sna *sna, struct kgem_bo *bo) +{ + kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); + + if (!kgem_check_batch(&sna->kgem, 120) || + !kgem_check_reloc(&sna->kgem, 4) || + !kgem_check_exec(&sna->kgem, 2)) { + _kgem_submit(&sna->kgem); + _kgem_set_mode(&sna->kgem, KGEM_RENDER); + } + + if (sna->render_state.gen3.need_invariant) + gen3_emit_invariant(sna); +} + +static int +gen3_get_inline_rectangles(struct sna *sna, int want, int floats_per_vertex) +{ + int size = floats_per_vertex * 3; + int rem = batch_space(sna) - 1; + + if (size * want > rem) + want = rem / size; + + return want; +} + +static bool +gen3_render_video(struct sna *sna, + struct sna_video *video, + struct sna_video_frame *frame, + RegionPtr dstRegion, + PixmapPtr pixmap) +{ + struct sna_pixmap *priv = sna_pixmap(pixmap); + BoxPtr pbox = REGION_RECTS(dstRegion); + int nbox = REGION_NUM_RECTS(dstRegion); + int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1; + int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1; + int src_width = frame->src.x2 - frame->src.x1; + int src_height = frame->src.y2 - frame->src.y1; + float src_offset_x, src_offset_y; + float src_scale_x, src_scale_y; + int pix_xoff, pix_yoff; + struct kgem_bo *dst_bo; + bool bilinear; + int copy = 0; + + DBG(("%s: src:%dx%d (frame:%dx%d) -> dst:%dx%d\n", __FUNCTION__, + src_width, src_height, frame->width, frame->height, dst_width, dst_height)); + + dst_bo = priv->gpu_bo; + if (dst_bo == NULL) + return false; + + bilinear = src_width != dst_width || src_height != dst_height; + + src_scale_x = (float)src_width / dst_width / frame->width; + src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x; + + src_scale_y = (float)src_height / dst_height / frame->height; + src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y; + DBG(("%s: src offset (%f, %f), scale (%f, %f)\n", + __FUNCTION__, src_offset_x, src_offset_y, src_scale_x, src_scale_y)); + + if (too_large(pixmap->drawable.width, pixmap->drawable.height) || + !gen3_check_pitch_3d(dst_bo)) { + int bpp = pixmap->drawable.bitsPerPixel; + + if (too_large(dst_width, dst_height)) + return false; + + dst_bo = kgem_create_2d(&sna->kgem, + dst_width, dst_height, bpp, + kgem_choose_tiling(&sna->kgem, + I915_TILING_X, + dst_width, dst_height, bpp), + 0); + if (!dst_bo) + return false; + + pix_xoff = -dstRegion->extents.x1; + pix_yoff = -dstRegion->extents.y1; + copy = 1; + } else { + /* Set up the offset for translating from the given region + * (in screen coordinates) to the backing pixmap. 
+ */ +#ifdef COMPOSITE + pix_xoff = -pixmap->screen_x + pixmap->drawable.x; + pix_yoff = -pixmap->screen_y + pixmap->drawable.y; +#else + pix_xoff = 0; + pix_yoff = 0; +#endif + + dst_width = pixmap->drawable.width; + dst_height = pixmap->drawable.height; + } + + gen3_video_get_batch(sna, dst_bo); + gen3_emit_video_state(sna, video, frame, pixmap, + dst_bo, dst_width, dst_height, bilinear); + do { + int nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4); + if (nbox_this_time == 0) { + gen3_video_get_batch(sna, dst_bo); + gen3_emit_video_state(sna, video, frame, pixmap, + dst_bo, dst_width, dst_height, bilinear); + nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4); + assert(nbox_this_time); + } + nbox -= nbox_this_time; + + OUT_BATCH(PRIM3D_RECTLIST | (12 * nbox_this_time - 1)); + do { + int box_x1 = pbox->x1; + int box_y1 = pbox->y1; + int box_x2 = pbox->x2; + int box_y2 = pbox->y2; + + pbox++; + + DBG(("%s: dst (%d, %d), (%d, %d) + (%d, %d); src (%f, %f), (%f, %f)\n", + __FUNCTION__, box_x1, box_y1, box_x2, box_y2, pix_xoff, pix_yoff, + box_x1 * src_scale_x + src_offset_x, + box_y1 * src_scale_y + src_offset_y, + box_x2 * src_scale_x + src_offset_x, + box_y2 * src_scale_y + src_offset_y)); + + /* bottom right */ + OUT_BATCH_F(box_x2 + pix_xoff); + OUT_BATCH_F(box_y2 + pix_yoff); + OUT_BATCH_F(box_x2 * src_scale_x + src_offset_x); + OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y); + + /* bottom left */ + OUT_BATCH_F(box_x1 + pix_xoff); + OUT_BATCH_F(box_y2 + pix_yoff); + OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x); + OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y); + + /* top left */ + OUT_BATCH_F(box_x1 + pix_xoff); + OUT_BATCH_F(box_y1 + pix_yoff); + OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x); + OUT_BATCH_F(box_y1 * src_scale_y + src_offset_y); + } while (--nbox_this_time); + } while (nbox); + + if (copy) { +#ifdef COMPOSITE + pix_xoff = -pixmap->screen_x + pixmap->drawable.x; + pix_yoff = -pixmap->screen_y + pixmap->drawable.y; +#else + pix_xoff = 0; + pix_yoff = 0; +#endif + sna_blt_copy_boxes(sna, GXcopy, + dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1, + priv->gpu_bo, pix_xoff, pix_yoff, + pixmap->drawable.bitsPerPixel, + REGION_RECTS(dstRegion), + REGION_NUM_RECTS(dstRegion)); + + kgem_bo_destroy(&sna->kgem, dst_bo); + } + + if (!DAMAGE_IS_ALL(priv->gpu_damage)) { + if ((pix_xoff | pix_yoff) == 0) { + sna_damage_add(&priv->gpu_damage, dstRegion); + sna_damage_subtract(&priv->cpu_damage, dstRegion); + } else { + sna_damage_add_boxes(&priv->gpu_damage, + REGION_RECTS(dstRegion), + REGION_NUM_RECTS(dstRegion), + pix_xoff, pix_yoff); + sna_damage_subtract_boxes(&priv->cpu_damage, + REGION_RECTS(dstRegion), + REGION_NUM_RECTS(dstRegion), + pix_xoff, pix_yoff); + } + } + + return true; +} + #endif @@ -2504,159 +4623,81 @@ cleanup_dst: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +#if 0 +static bool +gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo, + uint32_t color, + int16_t x1, int16_t y1, + int16_t x2, int16_t y2, + uint8_t alu) +{ + struct sna_composite_op tmp; + +#if NO_FILL_ONE + return gen3_render_fill_one_try_blt(sna, dst, bo, color, + x1, y1, x2, y2, alu); +#endif + + /* Prefer to use the BLT if already engaged */ + if (prefer_fill_blt(sna) && + 
gen3_render_fill_one_try_blt(sna, dst, bo, color, + x1, y1, x2, y2, alu)) + return true; + + /* Must use the BLT if we can't RENDER... */ + if (!(alu == GXcopy || alu == GXclear) || + too_large(dst->drawable.width, dst->drawable.height) || + bo->pitch > MAX_3D_PITCH) + return gen3_render_fill_one_try_blt(sna, dst, bo, color, + x1, y1, x2, y2, alu); + + if (alu == GXclear) + color = 0; + + tmp.op = color == 0 ? PictOpClear : PictOpSrc; + tmp.dst.pixmap = dst; + tmp.dst.width = dst->drawable.width; + tmp.dst.height = dst->drawable.height; + tmp.dst.format = sna_format_for_depth(dst->drawable.depth); + tmp.dst.bo = bo; + tmp.floats_per_vertex = 2; + tmp.floats_per_rect = 6; + tmp.need_magic_ca_pass = 0; + tmp.has_component_alpha = 0; + tmp.rb_reversed = 0; + + gen3_init_solid(&tmp.src, + sna_rgba_for_color(color, dst->drawable.depth)); + tmp.mask.bo = NULL; + tmp.mask.u.gen3.type = SHADER_NONE; + tmp.u.gen3.num_constants = 0; + + if (!kgem_check_bo(&sna->kgem, bo, NULL)) { + kgem_submit(&sna->kgem); + + if (gen3_render_fill_one_try_blt(sna, dst, bo, color, + x1, y1, x2, y2, alu)) + return true; + + if (!kgem_check_bo(&sna->kgem, bo, NULL)) + return false; + } + + gen3_align_vertex(sna, &tmp); + gen3_emit_composite_state(sna, &tmp); + gen3_get_rectangles(sna, &tmp, 1); + DBG((" (%d, %d), (%d, %d): %x\n", x1, y1, x2, y2, color)); + OUT_VERTEX(x2); + OUT_VERTEX(y2); + OUT_VERTEX(x1); + OUT_VERTEX(y2); + OUT_VERTEX(x1); + OUT_VERTEX(y1); + gen3_vertex_flush(sna); + + return true; +} +#endif static void gen3_render_flush(struct sna *sna) { @@ -2808,7 +4849,7 @@ gen3_blit_tex(struct sna *sna, kgem_submit(&sna->kgem); } - gen3_emit_composite_state(sna, tmp); gen3_align_vertex(sna, tmp); + gen3_emit_composite_state(sna, tmp); return true; } diff --git a/contrib/sdk/sources/Intel-2D/sna/gen4_common.c b/contrib/sdk/sources/Intel-2D/sna/gen4_common.c new file mode 100644 index 0000000000..f3d36c3bac --- /dev/null +++ b/contrib/sdk/sources/Intel-2D/sna/gen4_common.c @@ -0,0 +1,64 @@ +/* + * Copyright © 2011-2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gen4_common.h" +#include "gen4_vertex.h" + +void gen4_render_flush(struct sna *sna) +{ + gen4_vertex_close(sna); + + assert(sna->render.vb_id == 0); + assert(sna->render.vertex_offset == 0); +} + +void gen4_render_retire(struct kgem *kgem) +{ + struct sna *sna; + + sna = container_of(kgem, struct sna, kgem); + if (sna->render.nvertex_reloc == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { + DBG(("%s: resetting idle vbo\n", __FUNCTION__)); + sna->render.vertex_used = 0; + sna->render.vertex_index = 0; + } +} + +void gen4_render_expire(struct kgem *kgem) +{ + struct sna *sna; + + sna = container_of(kgem, struct sna, kgem); + if (sna->render.vbo && !sna->render.vertex_used) { + DBG(("%s: discarding vbo\n", __FUNCTION__)); + discard_vbo(sna); + } +} diff --git a/contrib/sdk/sources/Intel-2D/sna/gen4_common.h b/contrib/sdk/sources/Intel-2D/sna/gen4_common.h new file mode 100644 index 0000000000..de860bb00d --- /dev/null +++ b/contrib/sdk/sources/Intel-2D/sna/gen4_common.h @@ -0,0 +1,49 @@ +/* + * Copyright © 2011-2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Chris Wilson + * + */ + +#ifndef GEN4_COMMON_H +#define GEN4_COMMON_H + +#include "sna.h" + +inline static void +discard_vbo(struct sna *sna) +{ + kgem_bo_destroy(&sna->kgem, sna->render.vbo); + sna->render.vbo = NULL; + sna->render.vertices = sna->render.vertex_data; + sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); + sna->render.vertex_used = 0; + sna->render.vertex_index = 0; +} + +void gen4_render_flush(struct sna *sna); +void gen4_render_retire(struct kgem *kgem); +void gen4_render_expire(struct kgem *kgem); + +#endif /* GEN4_COMMON_H */ + diff --git a/contrib/sdk/sources/Intel-2D/sna/gen4_render.c b/contrib/sdk/sources/Intel-2D/sna/gen4_render.c index e214c68121..5fd764eea5 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen4_render.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen4_render.c @@ -41,6 +41,7 @@ //#include "sna_video.h" #include "brw/brw.h" +#include "gen4_common.h" #include "gen4_render.h" #include "gen4_source.h" #include "gen4_vertex.h" @@ -549,9 +550,6 @@ static int gen4_get_rectangles__flush(struct sna *sna, if (!kgem_check_reloc_and_exec(&sna->kgem, 2)) return 0; - if (op->need_magic_ca_pass && sna->render.vbo) - return 0; - if (sna->render.vertex_offset) { gen4_vertex_flush(sna); if (gen4_magic_ca_pass(sna, op)) @@ -747,16 +745,10 @@ gen4_align_vertex(struct sna *sna, const struct sna_composite_op *op) { assert(op->floats_per_rect == 3*op->floats_per_vertex); if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) { - if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) - gen4_vertex_finish(sna); - - DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", + DBG(("aligning vertex: was %d, now %d floats per vertex\n", sna->render_state.gen4.floats_per_vertex, - op->floats_per_vertex, - sna->render.vertex_index, - (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); - sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; - sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; + op->floats_per_vertex)); + gen4_vertex_align(sna, op); sna->render_state.gen4.floats_per_vertex = op->floats_per_vertex; } } @@ -1314,11 +1306,12 @@ gen4_render_video(struct sna *sna, if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { kgem_submit(&sna->kgem); - assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); + if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) + return false; } - gen4_video_bind_surfaces(sna, &tmp); gen4_align_vertex(sna, &tmp); + gen4_video_bind_surfaces(sna, &tmp); /* Set up the offset for translating from the given region (in screen * coordinates) to the backing pixmap. @@ -1548,33 +1541,6 @@ gen4_composite_set_target(struct sna *sna, return true; } -static bool -try_blt(struct sna *sna, - PicturePtr dst, PicturePtr src, - int width, int height) -{ - if (sna->kgem.mode != KGEM_RENDER) { - DBG(("%s: already performing BLT\n", __FUNCTION__)); - return true; - } - - if (too_large(width, height)) { - DBG(("%s: operation too large for 3D pipe (%d, %d)\n", - __FUNCTION__, width, height)); - return true; - } - - if (too_large(dst->pDrawable->width, dst->pDrawable->height)) - return true; - - /* The blitter is much faster for solids */ - if (sna_picture_is_solid(src, NULL)) - return true; - - /* is the source picture only in cpu memory e.g. a shm pixmap? 
*/ - return picture_is_cpu(sna, src); -} - static bool check_gradient(PicturePtr picture, bool precise) { @@ -1803,7 +1769,6 @@ gen4_render_composite(struct sna *sna, return false; if (mask == NULL && - try_blt(sna, dst, src, width, height) && sna_blt_composite(sna, op, src, dst, src_x, src_y, @@ -1932,8 +1897,8 @@ gen4_render_composite(struct sna *sna, goto cleanup_mask; } - gen4_bind_surfaces(sna, tmp); gen4_align_vertex(sna, tmp); + gen4_bind_surfaces(sna, tmp); return true; cleanup_mask: @@ -1989,51 +1954,6 @@ cleanup_dst: - -static void -gen4_render_flush(struct sna *sna) -{ - gen4_vertex_close(sna); - - assert(sna->render.vb_id == 0); - assert(sna->render.vertex_offset == 0); -} - -static void -discard_vbo(struct sna *sna) -{ - kgem_bo_destroy(&sna->kgem, sna->render.vbo); - sna->render.vbo = NULL; - sna->render.vertices = sna->render.vertex_data; - sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; -} - -static void -gen4_render_retire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { - DBG(("%s: resetting idle vbo\n", __FUNCTION__)); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - -static void -gen4_render_expire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (sna->render.vbo && !sna->render.vertex_used) { - DBG(("%s: discarding vbo\n", __FUNCTION__)); - discard_vbo(sna); - } -} static void gen4_render_reset(struct sna *sna) { @@ -2047,8 +1967,7 @@ static void gen4_render_reset(struct sna *sna) sna->render_state.gen4.drawrect_limit = -1; sna->render_state.gen4.surface_table = -1; - if (sna->render.vbo && - !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { + if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); discard_vbo(sna); } @@ -2407,8 +2326,8 @@ gen4_blit_tex(struct sna *sna, kgem_submit(&sna->kgem); } - gen4_bind_surfaces(sna, tmp); gen4_align_vertex(sna, tmp); + gen4_bind_surfaces(sna, tmp); return true; } diff --git a/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.c b/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.c index 91658a554a..cd6ff65322 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.c @@ -38,6 +38,29 @@ #define sse2 #endif +void gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op) +{ + int vertex_index; + + assert(op->floats_per_rect == 3*op->floats_per_vertex); + + vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; + if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) { + DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n", + __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex)); + if (gen4_vertex_finish(sna) < op->floats_per_rect) { + kgem_submit(&sna->kgem); + _kgem_set_mode(&sna->kgem, KGEM_RENDER); + } + + vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; + assert(vertex_index * op->floats_per_vertex <= sna->render.vertex_size); + } + + sna->render.vertex_index = vertex_index; + sna->render.vertex_used = vertex_index * op->floats_per_vertex; +} + void gen4_vertex_flush(struct sna *sna) { DBG(("%s[%x] = %d\n", __FUNCTION__, @@ -45,7 +68,9 @@ void gen4_vertex_flush(struct sna *sna) 
sna->render.vertex_index - sna->render.vertex_start)); assert(sna->render.vertex_offset); + assert(sna->render.vertex_offset <= sna->kgem.nbatch); assert(sna->render.vertex_index > sna->render.vertex_start); + assert(sna->render.vertex_used <= sna->render.vertex_size); sna->kgem.batch[sna->render.vertex_offset] = sna->render.vertex_index - sna->render.vertex_start; @@ -62,11 +87,14 @@ int gen4_vertex_finish(struct sna *sna) sna->render.vertex_used, sna->render.vertex_size)); assert(sna->render.vertex_offset == 0); assert(sna->render.vertex_used); + assert(sna->render.vertex_used <= sna->render.vertex_size); sna_vertex_wait__locked(&sna->render); /* Note: we only need dword alignment (currently) */ + hint = CREATE_GTT_MAP; + bo = sna->render.vbo; if (bo) { for (i = 0; i < sna->render.nvertex_reloc; i++) { @@ -88,11 +116,15 @@ int gen4_vertex_finish(struct sna *sna) sna->render.vb_id = 0; kgem_bo_destroy(&sna->kgem, bo); + hint |= CREATE_CACHED | CREATE_NO_THROTTLE; + } else { + if (kgem_is_idle(&sna->kgem)) { + sna->render.vertices = sna->render.vertex_data; + sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); + return 0; + } } - hint = CREATE_GTT_MAP; - if (bo) - hint |= CREATE_CACHED | CREATE_NO_THROTTLE; size = 256*1024; assert(!sna->render.active); @@ -163,7 +195,7 @@ void gen4_vertex_close(struct sna *sna) sna->render.vertices = sna->render.vertex_data; sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); free_bo = bo; - } else if (IS_CPU_MAP(bo->map) && !sna->kgem.has_llc) { + } else if (!sna->kgem.has_llc && sna->render.vertices == MAP(bo->map__cpu)) { DBG(("%s: converting CPU map to GTT\n", __FUNCTION__)); sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, sna->render.vbo); @@ -176,9 +208,16 @@ void gen4_vertex_close(struct sna *sna) } } else { - if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) { + int size; + + size = sna->kgem.nbatch; + size += sna->kgem.batch_size - sna->kgem.surface; + size += sna->render.vertex_used; + + if (size <= 1024) { DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__, sna->render.vertex_used, sna->kgem.nbatch)); + assert(sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface); memcpy(sna->kgem.batch + sna->kgem.nbatch, sna->render.vertex_data, sna->render.vertex_used * 4); @@ -186,6 +225,37 @@ void gen4_vertex_close(struct sna *sna) bo = NULL; sna->kgem.nbatch += sna->render.vertex_used; } else { + size = 256 * 1024; + do { + bo = kgem_create_linear(&sna->kgem, size, + CREATE_GTT_MAP | CREATE_NO_RETIRE | CREATE_NO_THROTTLE | CREATE_CACHED); + } while (bo == NULL && (size>>=1) > sizeof(float)*sna->render.vertex_used); + + sna->render.vertices = NULL; + if (bo) + sna->render.vertices = kgem_bo_map(&sna->kgem, bo); + if (sna->render.vertices != NULL) { + DBG(("%s: new vbo: %d / %d\n", __FUNCTION__, + sna->render.vertex_used, __kgem_bo_size(bo)/4)); + + assert(sizeof(float)*sna->render.vertex_used <= __kgem_bo_size(bo)); + memcpy(sna->render.vertices, + sna->render.vertex_data, + sizeof(float)*sna->render.vertex_used); + + size = __kgem_bo_size(bo)/4; + if (size >= UINT16_MAX) + size = UINT16_MAX - 1; + + sna->render.vbo = bo; + sna->render.vertex_size = size; + } else { + DBG(("%s: tmp vbo: %d\n", __FUNCTION__, + sna->render.vertex_used)); + + if (bo) + kgem_bo_destroy(&sna->kgem, bo); + bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used, CREATE_NO_THROTTLE); @@ -195,11 +265,14 @@ void gen4_vertex_close(struct sna *sna) kgem_bo_destroy(&sna->kgem, bo); bo = NULL; } - DBG(("%s: new vbo: %d\n", 
__FUNCTION__, - sna->render.vertex_used)); + + assert(sna->render.vbo == NULL); + sna->render.vertices = sna->render.vertex_data; + sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); free_bo = bo; } } + } assert(sna->render.nvertex_reloc); for (i = 0; i < sna->render.nvertex_reloc; i++) { diff --git a/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.h b/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.h index 31c81d684d..6eb1cc66d5 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.h +++ b/contrib/sdk/sources/Intel-2D/sna/gen4_vertex.h @@ -6,6 +6,7 @@ #include "sna.h" #include "sna_render.h" +void gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op); void gen4_vertex_flush(struct sna *sna); int gen4_vertex_finish(struct sna *sna); void gen4_vertex_close(struct sna *sna); diff --git a/contrib/sdk/sources/Intel-2D/sna/gen5_render.c b/contrib/sdk/sources/Intel-2D/sna/gen5_render.c index 43c2226a1c..16351a0914 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen5_render.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen5_render.c @@ -42,6 +42,7 @@ #include "brw/brw.h" #include "gen5_render.h" +#include "gen4_common.h" #include "gen4_source.h" #include "gen4_vertex.h" @@ -719,16 +720,10 @@ gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op) { assert(op->floats_per_rect == 3*op->floats_per_vertex); if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) { - if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) - gen4_vertex_finish(sna); - - DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", + DBG(("aligning vertex: was %d, now %d floats per vertex\n", sna->render_state.gen5.floats_per_vertex, - op->floats_per_vertex, - sna->render.vertex_index, - (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); - sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; - sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; + op->floats_per_vertex)); + gen4_vertex_align(sna, op); sna->render_state.gen5.floats_per_vertex = op->floats_per_vertex; } } @@ -942,10 +937,14 @@ gen5_emit_vertex_elements(struct sna *sna, inline static void gen5_emit_pipe_flush(struct sna *sna) { +#if 0 OUT_BATCH(GEN5_PIPE_CONTROL | (4 - 2)); OUT_BATCH(GEN5_PIPE_CONTROL_WC_FLUSH); OUT_BATCH(0); OUT_BATCH(0); +#else + OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH); +#endif } static void @@ -1311,11 +1310,12 @@ gen5_render_video(struct sna *sna, if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { kgem_submit(&sna->kgem); - assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); + if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) + return false; } - gen5_video_bind_surfaces(sna, &tmp); gen5_align_vertex(sna, &tmp); + gen5_video_bind_surfaces(sna, &tmp); /* Set up the offset for translating from the given region (in screen * coordinates) to the backing pixmap. 
@@ -1452,7 +1452,6 @@ gen5_render_composite(struct sna *sna, } if (mask == NULL && - try_blt(sna, dst, src, width, height) && sna_blt_composite(sna, op, src, dst, src_x, src_y, @@ -1577,8 +1576,8 @@ gen5_render_composite(struct sna *sna, goto cleanup_mask; } - gen5_bind_surfaces(sna, tmp); gen5_align_vertex(sna, tmp); + gen5_bind_surfaces(sna, tmp); return true; cleanup_mask: @@ -1806,8 +1805,8 @@ gen5_render_composite_spans(struct sna *sna, goto cleanup_src; } - gen5_bind_surfaces(sna, &tmp->base); gen5_align_vertex(sna, &tmp->base); + gen5_bind_surfaces(sna, &tmp->base); return true; cleanup_src: @@ -1952,7 +1951,10 @@ fallback_blt: kgem_submit(&sna->kgem); if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) { DBG(("%s: aperture check failed\n", __FUNCTION__)); - goto fallback_tiled_src; + kgem_bo_destroy(&sna->kgem, tmp.src.bo); + if (tmp.redirect.real_bo) + kgem_bo_destroy(&sna->kgem, tmp.dst.bo); + goto fallback_blt; } } @@ -1963,8 +1965,8 @@ fallback_blt: src_dx += tmp.src.offset[0]; src_dy += tmp.src.offset[1]; - gen5_copy_bind_surfaces(sna, &tmp); gen5_align_vertex(sna, &tmp); + gen5_copy_bind_surfaces(sna, &tmp); do { int n_this_time; @@ -1999,8 +2001,6 @@ fallback_blt: kgem_bo_destroy(&sna->kgem, tmp.src.bo); return true; -fallback_tiled_src: - kgem_bo_destroy(&sna->kgem, tmp.src.bo); fallback_tiled_dst: if (tmp.redirect.real_bo) kgem_bo_destroy(&sna->kgem, tmp.dst.bo); @@ -2021,16 +2021,6 @@ fallback_tiled: } #endif - -static void -gen5_render_flush(struct sna *sna) -{ - gen4_vertex_close(sna); - - assert(sna->render.vb_id == 0); - assert(sna->render.vertex_offset == 0); -} - static void gen5_render_context_switch(struct kgem *kgem, int new_mode) @@ -2060,42 +2050,6 @@ gen5_render_context_switch(struct kgem *kgem, } } -static void -discard_vbo(struct sna *sna) -{ - kgem_bo_destroy(&sna->kgem, sna->render.vbo); - sna->render.vbo = NULL; - sna->render.vertices = sna->render.vertex_data; - sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; -} - -static void -gen5_render_retire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { - DBG(("%s: resetting idle vbo\n", __FUNCTION__)); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - -static void -gen5_render_expire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (sna->render.vbo && !sna->render.vertex_used) { - DBG(("%s: discarding vbo\n", __FUNCTION__)); - discard_vbo(sna); - } -} - static void gen5_render_reset(struct sna *sna) { sna->render_state.gen5.needs_invariant = true; @@ -2107,8 +2061,7 @@ static void gen5_render_reset(struct sna *sna) sna->render_state.gen5.drawrect_limit = -1; sna->render_state.gen5.surface_table = -1; - if (sna->render.vbo && - !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) { + if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); discard_vbo(sna); } @@ -2351,8 +2304,8 @@ const char *gen5_render_init(struct sna *sna, const char *backend) return backend; sna->kgem.context_switch = gen5_render_context_switch; - sna->kgem.retire = gen5_render_retire; - sna->kgem.expire = gen5_render_expire; + sna->kgem.retire = gen4_render_retire; + sna->kgem.expire = gen4_render_expire; #if 0 #if !NO_COMPOSITE @@ -2362,7 +2315,7 @@ const char *gen5_render_init(struct sna *sna, const char 
*backend) #if !NO_COMPOSITE_SPANS sna->render.check_composite_spans = gen5_check_composite_spans; sna->render.composite_spans = gen5_render_composite_spans; - if (sna->PciInfo->device_id == 0x0044) + if (intel_get_device_id(sna->scrn) == 0x0044) sna->render.prefer_gpu |= PREFER_GPU_SPANS; #endif sna->render.video = gen5_render_video; @@ -2378,7 +2331,7 @@ const char *gen5_render_init(struct sna *sna, const char *backend) sna->render.blit_tex = gen5_blit_tex; sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; - sna->render.flush = gen5_render_flush; + sna->render.flush = gen4_render_flush; sna->render.reset = gen5_render_reset; sna->render.fini = gen5_render_fini; @@ -2466,8 +2419,8 @@ gen5_blit_tex(struct sna *sna, kgem_submit(&sna->kgem); } - gen5_bind_surfaces(sna, tmp); gen5_align_vertex(sna, tmp); - return true; + gen5_bind_surfaces(sna, tmp); + return true; } diff --git a/contrib/sdk/sources/Intel-2D/sna/gen6_common.c b/contrib/sdk/sources/Intel-2D/sna/gen6_common.c new file mode 100644 index 0000000000..8789109f2c --- /dev/null +++ b/contrib/sdk/sources/Intel-2D/sna/gen6_common.c @@ -0,0 +1,71 @@ +/* + * Copyright © 2011-2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gen6_common.h" +#include "gen4_vertex.h" + +void +gen6_render_context_switch(struct kgem *kgem, + int new_mode) +{ + if (kgem->nbatch) { + DBG(("%s: from %d to %d, submit batch\n", __FUNCTION__, kgem->mode, new_mode)); + _kgem_submit(kgem); + } + + if (kgem->nexec) { + DBG(("%s: from %d to %d, reset incomplete batch\n", __FUNCTION__, kgem->mode, new_mode)); + kgem_reset(kgem); + } + + assert(kgem->nbatch == 0); + assert(kgem->nreloc == 0); + assert(kgem->nexec == 0); + + kgem->ring = new_mode; +} + +void gen6_render_retire(struct kgem *kgem) +{ + struct sna *sna; + + if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire)) + kgem->ring = kgem->mode; + + sna = container_of(kgem, struct sna, kgem); + if (sna->render.nvertex_reloc == 0 && + sna->render.vbo && + !kgem_bo_is_busy(sna->render.vbo)) { + DBG(("%s: resetting idle vbo\n", __FUNCTION__)); + sna->render.vertex_used = 0; + sna->render.vertex_index = 0; + } +} diff --git a/contrib/sdk/sources/Intel-2D/sna/gen6_common.h b/contrib/sdk/sources/Intel-2D/sna/gen6_common.h new file mode 100644 index 0000000000..5ebdf09a95 --- /dev/null +++ b/contrib/sdk/sources/Intel-2D/sna/gen6_common.h @@ -0,0 +1,139 @@ +/* + * Copyright © 2011-2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Chris Wilson + * + */ + +#ifndef GEN6_COMMON_H +#define GEN6_COMMON_H + +#include "sna.h" + +#define NO_RING_SWITCH 0 +#define PREFER_RENDER 0 + +static inline bool is_uncached(struct sna *sna, + struct kgem_bo *bo) +{ + return bo->scanout && !sna->kgem.has_wt; +} + +inline static bool can_switch_to_blt(struct sna *sna, + struct kgem_bo *bo, + unsigned flags) +{ + return false; +} + +inline static bool can_switch_to_render(struct sna *sna, + struct kgem_bo *bo) +{ + if (sna->kgem.ring == KGEM_RENDER) + return true; + + if (NO_RING_SWITCH) + return false; + + if (!sna->kgem.has_semaphores) + return false; + + if (bo && !RQ_IS_BLT(bo->rq) && !is_uncached(sna, bo)) + return true; + + return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER); +} + +static inline bool untiled_tlb_miss(struct kgem_bo *bo) +{ + if (kgem_bo_is_render(bo)) + return false; + + return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096; +} + +static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo) +{ + if (bo->rq) + return RQ_IS_BLT(bo->rq); + + if (sna->flags & SNA_POWERSAVE) + return true; + + return bo->tiling == I915_TILING_NONE || is_uncached(sna, bo); +} + +inline static bool force_blt_ring(struct sna *sna) +{ + if (sna->flags & SNA_POWERSAVE) + return true; + + if (sna->kgem.mode == KGEM_RENDER) + return false; + + if (sna->render_state.gt < 2) + return true; + + return false; +} + +inline static bool prefer_blt_ring(struct sna *sna, + struct kgem_bo *bo, + unsigned flags) +{ + assert(!force_blt_ring(sna)); + assert(!kgem_bo_is_render(bo)); + + return can_switch_to_blt(sna, bo, flags); +} + +inline static bool prefer_render_ring(struct sna *sna, + struct kgem_bo *bo) +{ + if (sna->flags & SNA_POWERSAVE) + return false; + + if (sna->render_state.gt < 2) + return false; + + return can_switch_to_render(sna, bo); +} + +inline static bool +prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp) +{ + return false; + +} + +static inline bool prefer_blt_fill(struct sna *sna, + struct kgem_bo *bo, + unsigned flags) +{ + return false; +} + +void gen6_render_context_switch(struct kgem *kgem, int new_mode); +void gen6_render_retire(struct kgem *kgem); + +#endif /* GEN6_COMMON_H */ diff --git a/contrib/sdk/sources/Intel-2D/sna/gen6_render.c b/contrib/sdk/sources/Intel-2D/sna/gen6_render.c index 8818017eac..b06238f5cc 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen6_render.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen6_render.c @@ -39,6 +39,8 @@ #include "brw/brw.h" #include "gen6_render.h" +#include "gen6_common.h" +#include "gen4_common.h" #include "gen4_source.h" #include "gen4_vertex.h" @@ -74,6 +76,7 @@ struct gt_info { int max_vs_entries; int max_gs_entries; } urb; + int gt; }; static const struct gt_info gt1_info = { @@ -82,6 +85,7 @@ static const struct gt_info gt1_info = { .max_gs_threads = 21, .max_wm_threads = 40, .urb = { 32, 256, 256 }, + .gt = 1, }; static const struct gt_info gt2_info = { @@ -90,6 +94,7 @@ static const struct gt_info gt2_info = { .max_gs_threads = 60, .max_wm_threads = 80, .urb = { 64, 256, 256 }, + .gt = 2, }; static const uint32_t ps_kernel_packed[][4] = { @@ -872,21 +877,22 @@ gen6_emit_state(struct sna *sna, const struct sna_composite_op *op, uint16_t wm_binding_table) { - bool need_stall = wm_binding_table & 1; + bool need_flush, need_stall; assert(op->dst.bo->exec); - if (gen6_emit_cc(sna, GEN6_BLEND(op->u.gen6.flags))) - need_stall = false; + need_flush = + gen6_emit_cc(sna, GEN6_BLEND(op->u.gen6.flags)) && + wm_binding_table & 1; gen6_emit_sampler(sna, 
GEN6_SAMPLER(op->u.gen6.flags)); gen6_emit_sf(sna, GEN6_VERTEX(op->u.gen6.flags) >> 2); gen6_emit_wm(sna, GEN6_KERNEL(op->u.gen6.flags), GEN6_VERTEX(op->u.gen6.flags) >> 2); gen6_emit_vertex_elements(sna, op); - need_stall |= gen6_emit_binding_table(sna, wm_binding_table & ~1); + need_stall = gen6_emit_binding_table(sna, wm_binding_table & ~1); if (gen6_emit_drawing_rectangle(sna, op)) need_stall = false; - if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { + if (need_flush || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { gen6_emit_flush(sna); kgem_clear_dirty(&sna->kgem); assert(op->dst.bo->exec); @@ -1317,16 +1323,10 @@ gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op) { assert (sna->render.vertex_offset == 0); if (op->floats_per_vertex != sna->render_state.gen6.floats_per_vertex) { - if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) - gen4_vertex_finish(sna); - - DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", + DBG(("aligning vertex: was %d, now %d floats per vertex\n", sna->render_state.gen6.floats_per_vertex, - op->floats_per_vertex, - sna->render.vertex_index, - (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); - sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; - sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; + op->floats_per_vertex)); + gen4_vertex_align(sna, op); sna->render_state.gen6.floats_per_vertex = op->floats_per_vertex; } assert((sna->render.vertex_used % op->floats_per_vertex) == 0); @@ -1657,8 +1657,8 @@ gen6_render_video(struct sna *sna, _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen6_emit_video_state(sna, &tmp); gen6_align_vertex(sna, &tmp); + gen6_emit_video_state(sna, &tmp); /* Set up the offset for translating from the given region (in screen * coordinates) to the backing pixmap. 
@@ -1853,9 +1853,9 @@ gen6_composite_set_target(struct sna *sna, } else sna_render_picture_extents(dst, &box); -// op->dst.bo = sna_drawable_use_bo (dst->pDrawable, -// PREFER_GPU | FORCE_GPU | RENDER_GPU, -// &box, &op->damage); + op->dst.bo = sna_drawable_use_bo(dst->pDrawable, + PREFER_GPU | FORCE_GPU | RENDER_GPU, + &box, &op->damage); if (op->dst.bo == NULL) return false; @@ -1925,7 +1925,13 @@ gen6_render_composite(struct sna *sna, return true; if (gen6_composite_fallback(sna, src, mask, dst)) - return false; + return (mask == NULL && + sna_blt_composite(sna, op, + src, dst, + src_x, src_y, + dst_x, dst_y, + width, height, + tmp, true)); if (need_tiling(sna, width, height)) return sna_tiling_composite(op, src, mask, dst, @@ -2051,8 +2057,8 @@ gen6_render_composite(struct sna *sna, _kgem_set_mode(&sna->kgem, KGEM_RENDER); } + gen6_align_vertex(sna, tmp); gen6_emit_composite_state(sna, tmp); - gen6_align_vertex(sna, tmp); return true; cleanup_mask: @@ -2284,8 +2290,8 @@ gen6_render_composite_spans(struct sna *sna, _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen6_emit_composite_state(sna, &tmp->base); gen6_align_vertex(sna, &tmp->base); + gen6_emit_composite_state(sna, &tmp->base); return true; cleanup_src: @@ -2351,10 +2357,16 @@ static inline bool prefer_blt_copy(struct sna *sna, untiled_tlb_miss(dst_bo)) return true; + if (force_blt_ring(sna)) + return true; + if (kgem_bo_is_render(dst_bo) || kgem_bo_is_render(src_bo)) return false; + if (prefer_render_ring(sna, dst_bo)) + return false; + if (!prefer_blt_ring(sna, dst_bo, flags)) return false; @@ -2553,13 +2565,17 @@ fallback_blt: if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) { DBG(("%s: too large for a single operation\n", __FUNCTION__)); - goto fallback_tiled_src; + if (tmp.src.bo != src_bo) + kgem_bo_destroy(&sna->kgem, tmp.src.bo); + if (tmp.redirect.real_bo) + kgem_bo_destroy(&sna->kgem, tmp.dst.bo); + goto fallback_blt; } _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen6_emit_copy_state(sna, &tmp); gen6_align_vertex(sna, &tmp); + gen6_emit_copy_state(sna, &tmp); do { int16_t *v; @@ -2596,9 +2612,6 @@ fallback_blt: kgem_bo_destroy(&sna->kgem, tmp.src.bo); return true; -fallback_tiled_src: - if (tmp.src.bo != src_bo) - kgem_bo_destroy(&sna->kgem, tmp.src.bo); fallback_tiled_dst: if (tmp.redirect.real_bo) kgem_bo_destroy(&sna->kgem, tmp.dst.bo); @@ -2720,8 +2733,8 @@ fallback: _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen6_emit_copy_state(sna, &op->base); gen6_align_vertex(sna, &op->base); + gen6_emit_copy_state(sna, &op->base); op->blt = gen6_render_copy_blt; op->done = gen6_render_copy_done; @@ -2760,24 +2773,6 @@ gen6_emit_fill_state(struct sna *sna, const struct sna_composite_op *op) gen6_emit_state(sna, op, offset | dirty); } -static inline bool prefer_blt_fill(struct sna *sna, - struct kgem_bo *bo) -{ - if (PREFER_RENDER) - return PREFER_RENDER < 0; - - if (kgem_bo_is_render(bo)) - return false; - - if (untiled_tlb_miss(bo)) - return true; - - if (!prefer_blt_ring(sna, bo, 0)) - return false; - - return prefer_blt_bo(sna, bo); -} - static bool gen6_render_fill_boxes(struct sna *sna, CARD8 op, @@ -2799,7 +2794,8 @@ gen6_render_fill_boxes(struct sna *sna, return false; } - if (prefer_blt_fill(sna, dst_bo) || !gen6_check_dst_format(format)) { + if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) || + !gen6_check_dst_format(format)) { uint8_t alu = GXinvalid; if (op <= PictOpSrc) { @@ -2874,13 +2870,14 @@ gen6_render_fill_boxes(struct sna *sna, assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); 
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); + kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { kgem_submit(&sna->kgem); assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); } - gen6_emit_fill_state(sna, &tmp); gen6_align_vertex(sna, &tmp); + gen6_emit_fill_state(sna, &tmp); do { int n_this_time; @@ -3009,12 +3006,12 @@ gen6_render_op_fill_done(struct sna *sna, const struct sna_fill_op *op) static bool gen6_render_fill(struct sna *sna, uint8_t alu, PixmapPtr dst, struct kgem_bo *dst_bo, - uint32_t color, + uint32_t color, unsigned flags, struct sna_fill_op *op) { DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color)); - if (prefer_blt_fill(sna, dst_bo) && + if (prefer_blt_fill(sna, dst_bo, flags) && sna_blt_fill(sna, alu, dst_bo, dst->drawable.bitsPerPixel, color, @@ -3053,13 +3050,14 @@ gen6_render_fill(struct sna *sna, uint8_t alu, assert(GEN6_SAMPLER(op->base.u.gen6.flags) == FILL_SAMPLER); assert(GEN6_VERTEX(op->base.u.gen6.flags) == FILL_VERTEX); + kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { kgem_submit(&sna->kgem); assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); } - gen6_emit_fill_state(sna, &op->base); gen6_align_vertex(sna, &op->base); + gen6_emit_fill_state(sna, &op->base); op->blt = gen6_render_op_fill_blt; op->box = gen6_render_op_fill_box; @@ -3097,7 +3095,7 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo, int16_t *v; /* Prefer to use the BLT if already engaged */ - if (prefer_blt_fill(sna, bo) && + if (prefer_blt_fill(sna, bo, FILL_BOXES) && gen6_render_fill_one_try_blt(sna, dst, bo, color, x1, y1, x2, y2, alu)) return true; @@ -3133,6 +3131,7 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo, assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); + kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); if (!kgem_check_bo(&sna->kgem, bo, NULL)) { kgem_submit(&sna->kgem); if (!kgem_check_bo(&sna->kgem, bo, NULL)) { @@ -3141,8 +3140,8 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo, } } - gen6_emit_fill_state(sna, &tmp); gen6_align_vertex(sna, &tmp); + gen6_emit_fill_state(sna, &tmp); gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state); @@ -3219,6 +3218,7 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo) assert(GEN6_SAMPLER(tmp.u.gen6.flags) == FILL_SAMPLER); assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX); + kgem_set_mode(&sna->kgem, KGEM_RENDER, bo); if (!kgem_check_bo(&sna->kgem, bo, NULL)) { kgem_submit(&sna->kgem); if (!kgem_check_bo(&sna->kgem, bo, NULL)) { @@ -3227,8 +3227,8 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo) } } - gen6_emit_fill_state(sna, &tmp); gen6_align_vertex(sna, &tmp); + gen6_emit_fill_state(sna, &tmp); gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state); @@ -3251,60 +3251,6 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo) } #endif -static void gen6_render_flush(struct sna *sna) -{ - gen4_vertex_close(sna); - - assert(sna->render.vb_id == 0); - assert(sna->render.vertex_offset == 0); -} - -static void -gen6_render_context_switch(struct kgem *kgem, - int new_mode) -{ - if (kgem->nbatch) { - DBG(("%s: from %d to %d\n", __FUNCTION__, kgem->mode, new_mode)); - _kgem_submit(kgem); - } - - kgem->ring = new_mode; -} - -static void -gen6_render_retire(struct kgem *kgem) -{ - struct sna *sna; - - if (kgem->ring && (kgem->has_semaphores || 
!kgem->need_retire)) - kgem->ring = kgem->mode; - - sna = container_of(kgem, struct sna, kgem); - if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { - DBG(("%s: resetting idle vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle)); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - -static void -gen6_render_expire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (sna->render.vbo && !sna->render.vertex_used) { - DBG(("%s: discarding vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle)); - kgem_bo_destroy(kgem, sna->render.vbo); - assert(!sna->render.active); - sna->render.vbo = NULL; - sna->render.vertices = sna->render.vertex_data; - sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - static void gen6_render_reset(struct sna *sna) { sna->render_state.gen6.needs_invariant = true; @@ -3320,6 +3266,11 @@ static void gen6_render_reset(struct sna *sna) sna->render_state.gen6.drawrect_limit = -1; sna->render_state.gen6.surface_table = -1; + if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { + DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); + discard_vbo(sna); + } + sna->render.vertex_offset = 0; sna->render.nvertex_reloc = 0; sna->render.vb_id = 0; @@ -3330,17 +3281,17 @@ static void gen6_render_fini(struct sna *sna) kgem_bo_destroy(&sna->kgem, sna->render_state.gen6.general_bo); } -static bool is_gt2(struct sna *sna) +static bool is_gt2(struct sna *sna, int devid) { - return sna->PciInfo->device_id & 0x30; + return devid & 0x30; } -static bool is_mobile(struct sna *sna) +static bool is_mobile(struct sna *sna, int devid) { - return (sna->PciInfo->device_id & 0xf) == 0x6; + return (devid & 0xf) == 0x6; } -static bool gen6_render_setup(struct sna *sna) +static bool gen6_render_setup(struct sna *sna, int devid) { struct gen6_render_state *state = &sna->render_state.gen6; struct sna_static_stream general; @@ -3348,8 +3299,9 @@ static bool gen6_render_setup(struct sna *sna) int i, j, k, l, m; state->info = >1_info; - if (is_gt2(sna)) + if (is_gt2(sna, devid)) state->info = >2_info; /* XXX requires GT_MODE WiZ disabled */ + state->gt = state->info->gt; sna_static_stream_init(&general); @@ -3420,12 +3372,14 @@ static bool gen6_render_setup(struct sna *sna) const char *gen6_render_init(struct sna *sna, const char *backend) { - if (!gen6_render_setup(sna)) + int devid = intel_get_device_id(sna); + + if (!gen6_render_setup(sna, devid)) return backend; sna->kgem.context_switch = gen6_render_context_switch; sna->kgem.retire = gen6_render_retire; - sna->kgem.expire = gen6_render_expire; + sna->kgem.expire = gen4_render_expire; #if 0 #if !NO_COMPOSITE @@ -3436,7 +3390,7 @@ const char *gen6_render_init(struct sna *sna, const char *backend) #if !NO_COMPOSITE_SPANS sna->render.check_composite_spans = gen6_check_composite_spans; sna->render.composite_spans = gen6_render_composite_spans; - if (is_mobile(sna)) + if (is_mobile(sna, devid)) sna->render.prefer_gpu |= PREFER_GPU_SPANS; #endif sna->render.video = gen6_render_video; @@ -3465,7 +3419,7 @@ const char *gen6_render_init(struct sna *sna, const char *backend) sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; sna->render.blit_tex = gen6_blit_tex; - sna->render.flush = gen6_render_flush; + sna->render.flush = gen4_render_flush; sna->render.reset = gen6_render_reset; sna->render.fini = gen6_render_fini; @@ -3568,7 +3522,7 @@ gen6_blit_tex(struct sna *sna, 
// tmp->box = gen6_render_composite_box; tmp->done = gen6_render_composite_done; - kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo); + kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); if (!kgem_check_bo(&sna->kgem, tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL)) { @@ -3576,8 +3530,8 @@ gen6_blit_tex(struct sna *sna, _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen6_emit_composite_state(sna, tmp); gen6_align_vertex(sna, tmp); - return true; + gen6_emit_composite_state(sna, tmp); + return true; } diff --git a/contrib/sdk/sources/Intel-2D/sna/gen7_render.c b/contrib/sdk/sources/Intel-2D/sna/gen7_render.c index 68a5e44f9f..92331a3be0 100644 --- a/contrib/sdk/sources/Intel-2D/sna/gen7_render.c +++ b/contrib/sdk/sources/Intel-2D/sna/gen7_render.c @@ -42,10 +42,14 @@ #include "brw/brw.h" #include "gen7_render.h" +#include "gen4_common.h" #include "gen4_source.h" #include "gen4_vertex.h" +#include "gen6_common.h" +#define ALWAYS_INVALIDATE 0 #define ALWAYS_FLUSH 0 +#define ALWAYS_STALL 0 #define NO_COMPOSITE 0 #define NO_COMPOSITE_SPANS 0 @@ -1022,33 +1026,51 @@ gen7_emit_state(struct sna *sna, const struct sna_composite_op *op, uint16_t wm_binding_table) { + bool need_invalidate; + bool need_flush; bool need_stall; assert(op->dst.bo->exec); + need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo); + if (ALWAYS_INVALIDATE) + need_invalidate = true; + + need_flush = + sna->render_state.gen7.emit_flush && + wm_binding_table & GEN7_READS_DST(op->u.gen7.flags); + if (ALWAYS_FLUSH) + need_flush = true; + + wm_binding_table &= ~1; + + need_stall = sna->render_state.gen7.surface_table != wm_binding_table; + need_stall &= gen7_emit_drawing_rectangle(sna, op); + if (ALWAYS_STALL) + need_stall = true; + + if (need_invalidate) { + gen7_emit_pipe_invalidate(sna); + kgem_clear_dirty(&sna->kgem); + assert(op->dst.bo->exec); + kgem_bo_mark_dirty(op->dst.bo); + + need_flush = false; + need_stall = false; + } + if (need_flush) { + gen7_emit_pipe_flush(sna, need_stall); + need_stall = false; + } + if (need_stall) + gen7_emit_pipe_stall(sna); + gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags)); gen7_emit_sampler(sna, GEN7_SAMPLER(op->u.gen7.flags)); gen7_emit_sf(sna, GEN7_VERTEX(op->u.gen7.flags) >> 2); gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags)); gen7_emit_vertex_elements(sna, op); - - need_stall = gen7_emit_binding_table(sna, wm_binding_table); - need_stall &= gen7_emit_drawing_rectangle(sna, op); - - if (ALWAYS_FLUSH || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) { - gen7_emit_pipe_invalidate(sna); - kgem_clear_dirty(&sna->kgem); - assert(op->dst.bo->exec); - kgem_bo_mark_dirty(op->dst.bo); - sna->render_state.gen7.emit_flush = false; - need_stall = false; - } - if (sna->render_state.gen7.emit_flush) { - gen7_emit_pipe_flush(sna, need_stall); - need_stall = false; - } - if (need_stall) - gen7_emit_pipe_stall(sna); + gen7_emit_binding_table(sna, wm_binding_table); sna->render_state.gen7.emit_flush = GEN7_READS_DST(op->u.gen7.flags); } @@ -1404,12 +1426,14 @@ static void gen7_emit_composite_state(struct sna *sna, const struct sna_composite_op *op) { uint32_t *binding_table; - uint16_t offset; + uint16_t offset, dirty; gen7_get_batch(sna, op); binding_table = gen7_composite_get_binding_table(sna, &offset); + dirty = kgem_bo_is_dirty(op->dst.bo); + binding_table[0] = gen7_bind_bo(sna, op->dst.bo, op->dst.width, op->dst.height, @@ -1438,23 +1462,16 @@ static void gen7_emit_composite_state(struct sna *sna, offset = sna->render_state.gen7.surface_table; } - 
gen7_emit_state(sna, op, offset); + gen7_emit_state(sna, op, offset | dirty); } static void gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op) { if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) { - if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect) - gen4_vertex_finish(sna); - - DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n", - sna->render_state.gen7.floats_per_vertex, - op->floats_per_vertex, - sna->render.vertex_index, - (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex)); - sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex; - sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex; + DBG(("aligning vertex: was %d, now %d floats per vertex\n", + sna->render_state.gen7.floats_per_vertex, op->floats_per_vertex)); + gen4_vertex_align(sna, op); sna->render_state.gen7.floats_per_vertex = op->floats_per_vertex; } } @@ -1548,7 +1565,7 @@ static void gen7_emit_video_state(struct sna *sna, int src_height[6]; int src_pitch[6]; uint32_t *binding_table; - uint16_t offset; + uint16_t offset, dirty; int n_src, n; gen7_get_batch(sna, op); @@ -1586,6 +1603,8 @@ static void gen7_emit_video_state(struct sna *sna, binding_table = gen7_composite_get_binding_table(sna, &offset); + dirty = kgem_bo_is_dirty(op->dst.bo); + binding_table[0] = gen7_bind_bo(sna, op->dst.bo, op->dst.width, op->dst.height, @@ -1602,7 +1621,7 @@ static void gen7_emit_video_state(struct sna *sna, src_surf_format); } - gen7_emit_state(sna, op, offset); + gen7_emit_state(sna, op, offset | dirty); } static bool @@ -1669,12 +1688,14 @@ gen7_render_video(struct sna *sna, kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo); if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) { kgem_submit(&sna->kgem); - assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)); + if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) + return false; + _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen7_emit_video_state(sna, &tmp); gen7_align_vertex(sna, &tmp); + gen7_emit_video_state(sna, &tmp); /* Set up the offset for translating from the given region (in screen * coordinates) to the backing pixmap. 
@@ -1874,7 +1895,8 @@ gen7_render_fill_boxes(struct sna *sna, return false; } - if (prefer_blt_fill(sna, dst_bo) || !gen7_check_dst_format(format)) { + if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) || + !gen7_check_dst_format(format)) { uint8_t alu = GXinvalid; if (op <= PictOpSrc) { @@ -1949,11 +1971,17 @@ gen7_render_fill_boxes(struct sna *sna, kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { kgem_submit(&sna->kgem); - assert(kgem_check_bo(&sna->kgem, dst_bo, NULL)); + if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) { + kgem_bo_destroy(&sna->kgem, tmp.src.bo); + if (tmp.redirect.real_bo) + kgem_bo_destroy(&sna->kgem, tmp.dst.bo); + return false; + } + _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen7_emit_fill_state(sna, &tmp); gen7_align_vertex(sna, &tmp); + gen7_emit_fill_state(sna, &tmp); do { int n_this_time; @@ -1987,60 +2015,6 @@ gen7_render_fill_boxes(struct sna *sna, } #endif -static void gen7_render_flush(struct sna *sna) -{ - gen4_vertex_close(sna); - - assert(sna->render.vb_id == 0); - assert(sna->render.vertex_offset == 0); -} - -static void -gen7_render_context_switch(struct kgem *kgem, - int new_mode) -{ - if (kgem->nbatch) { - DBG(("%s: switch rings %d -> %d\n", - __FUNCTION__, kgem->mode, new_mode)); - _kgem_submit(kgem); - } - - kgem->ring = new_mode; -} - -static void -gen7_render_retire(struct kgem *kgem) -{ - struct sna *sna; - - if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire)) - kgem->ring = kgem->mode; - - sna = container_of(kgem, struct sna, kgem); - if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) { - DBG(("%s: resetting idle vbo\n", __FUNCTION__)); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - -static void -gen7_render_expire(struct kgem *kgem) -{ - struct sna *sna; - - sna = container_of(kgem, struct sna, kgem); - if (sna->render.vbo && !sna->render.vertex_used) { - DBG(("%s: discarding vbo\n", __FUNCTION__)); - kgem_bo_destroy(kgem, sna->render.vbo); - sna->render.vbo = NULL; - sna->render.vertices = sna->render.vertex_data; - sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data); - sna->render.vertex_used = 0; - sna->render.vertex_index = 0; - } -} - static void gen7_render_reset(struct sna *sna) { sna->render_state.gen7.emit_flush = false; @@ -2056,6 +2030,11 @@ static void gen7_render_reset(struct sna *sna) sna->render_state.gen7.drawrect_limit = -1; sna->render_state.gen7.surface_table = -1; + if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) { + DBG(("%s: discarding unmappable vbo\n", __FUNCTION__)); + discard_vbo(sna); + } + sna->render.vertex_offset = 0; sna->render.nvertex_reloc = 0; sna->render.vb_id = 0; @@ -2066,23 +2045,23 @@ static void gen7_render_fini(struct sna *sna) kgem_bo_destroy(&sna->kgem, sna->render_state.gen7.general_bo); } -static bool is_gt3(struct sna *sna) +static bool is_gt3(struct sna *sna, int devid) { assert(sna->kgem.gen == 075); - return sna->PciInfo->device_id & 0x20; + return devid & 0x20; } -static bool is_gt2(struct sna *sna) +static bool is_gt2(struct sna *sna, int devid) { - return sna->PciInfo->device_id & (is_hsw(sna)? 0x30 : 0x20); + return devid & (is_hsw(sna)? 
0x30 : 0x20); } -static bool is_mobile(struct sna *sna) +static bool is_mobile(struct sna *sna, int devid) { - return (sna->PciInfo->device_id & 0xf) == 0x6; + return (devid & 0xf) == 0x6; } -static bool gen7_render_setup(struct sna *sna) +static bool gen7_render_setup(struct sna *sna, int devid) { struct gen7_render_state *state = &sna->render_state.gen7; struct sna_static_stream general; @@ -2091,19 +2070,19 @@ static bool gen7_render_setup(struct sna *sna) if (is_ivb(sna)) { state->info = &ivb_gt_info; - if (sna->PciInfo->device_id & 0xf) { + if (devid & 0xf) { state->info = &ivb_gt1_info; - if (is_gt2(sna)) + if (is_gt2(sna, devid)) state->info = &ivb_gt2_info; /* XXX requires GT_MODE WiZ disabled */ } } else if (is_byt(sna)) { state->info = &byt_gt_info; } else if (is_hsw(sna)) { state->info = &hsw_gt_info; - if (sna->PciInfo->device_id & 0xf) { - if (is_gt3(sna)) + if (devid & 0xf) { + if (is_gt3(sna, devid)) state->info = &hsw_gt3_info; - else if (is_gt2(sna)) + else if (is_gt2(sna, devid)) state->info = &hsw_gt2_info; else state->info = &hsw_gt1_info; @@ -2111,6 +2090,8 @@ static bool gen7_render_setup(struct sna *sna) } else return false; + state->gt = state->info->gt; + sna_static_stream_init(&general); /* Zero pad the start. If you see an offset of 0x0 in the batchbuffer @@ -2175,12 +2156,14 @@ static bool gen7_render_setup(struct sna *sna) const char *gen7_render_init(struct sna *sna, const char *backend) { - if (!gen7_render_setup(sna)) + int devid = intel_get_device_id(sna); + + if (!gen7_render_setup(sna, devid)) return backend; - sna->kgem.context_switch = gen7_render_context_switch; - sna->kgem.retire = gen7_render_retire; - sna->kgem.expire = gen7_render_expire; + sna->kgem.context_switch = gen6_render_context_switch; + sna->kgem.retire = gen6_render_retire; + sna->kgem.expire = gen4_render_expire; #if 0 #if !NO_COMPOSITE @@ -2190,7 +2173,7 @@ const char *gen7_render_init(struct sna *sna, const char *backend) #if !NO_COMPOSITE_SPANS sna->render.check_composite_spans = gen7_check_composite_spans; sna->render.composite_spans = gen7_render_composite_spans; - if (is_mobile(sna) || is_gt2(sna) || is_byt(sna)) + if (is_mobile(sna, devid) || is_gt2(sna, devid) || is_byt(sna)) sna->render.prefer_gpu |= PREFER_GPU_SPANS; #endif sna->render.video = gen7_render_video; @@ -2219,7 +2202,7 @@ const char *gen7_render_init(struct sna *sna, const char *backend) sna->render.blit_tex = gen7_blit_tex; sna->render.caps = HW_BIT_BLIT | HW_TEX_BLIT; - sna->render.flush = gen7_render_flush; + sna->render.flush = gen4_render_flush; sna->render.reset = gen7_render_reset; sna->render.fini = gen7_render_fini; @@ -2312,7 +2295,7 @@ gen7_blit_tex(struct sna *sna, // tmp->box = gen7_render_composite_box; tmp->done = gen7_render_composite_done; - kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo); + kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo); if (!kgem_check_bo(&sna->kgem, tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL)) { @@ -2320,7 +2303,7 @@ gen7_blit_tex(struct sna *sna, _kgem_set_mode(&sna->kgem, KGEM_RENDER); } - gen7_emit_composite_state(sna, tmp); gen7_align_vertex(sna, tmp); + gen7_emit_composite_state(sna, tmp); return true; } diff --git a/contrib/sdk/sources/Intel-2D/sna/kgem.c b/contrib/sdk/sources/Intel-2D/sna/kgem.c index 5943a24d7c..84d5e8cfb8 100644 --- a/contrib/sdk/sources/Intel-2D/sna/kgem.c +++ b/contrib/sdk/sources/Intel-2D/sna/kgem.c @@ -47,7 +47,6 @@ #include "sna_cpuid.h" - static struct kgem_bo * search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); 
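The per-generation flush/expire callbacks deleted in the gen6/gen7 hunks above are replaced by shared implementations: gen6_render_init and gen7_render_init now install gen4_render_flush and gen4_render_expire, and gen7 additionally reuses gen6_render_context_switch/gen6_render_retire. The shared definitions are not visible in this excerpt; the sketch below is reconstructed from the removed gen6/gen7 copies and assumes they live in the new gen4_common.c/gen4_common.h added by this patch, so the exact upstream bodies and linkage may differ.

/* Sketch only: consolidated vertex-buffer housekeeping shared by gen4+.
 * Reconstructed from the gen6_render_expire/gen7_render_flush copies
 * removed in the hunks above; header names are assumptions.
 */
#include "sna.h"
#include "gen4_common.h"
#include "gen4_vertex.h"

void gen4_render_flush(struct sna *sna)
{
	/* Submit any pending vertices and release the vbo binding. */
	gen4_vertex_close(sna);

	assert(sna->render.vb_id == 0);
	assert(sna->render.vertex_offset == 0);
}

void gen4_render_expire(struct kgem *kgem)
{
	struct sna *sna = container_of(kgem, struct sna, kgem);

	/* If the vbo holds no vertices, destroy it and fall back to the
	 * static vertex_data[] array until the next allocation.
	 */
	if (sna->render.vbo && !sna->render.vertex_used) {
		DBG(("%s: discarding vbo handle=%d\n",
		     __FUNCTION__, sna->render.vbo->handle));
		kgem_bo_destroy(kgem, sna->render.vbo);
		assert(!sna->render.active);
		sna->render.vbo = NULL;
		sna->render.vertices = sna->render.vertex_data;
		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
		sna->render.vertex_used = 0;
		sna->render.vertex_index = 0;
	}
}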
@@ -60,7 +59,7 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); #define DBG_NO_CACHE_LEVEL 0 #define DBG_NO_CPU 0 #define DBG_NO_CREATE2 1 -#define DBG_NO_USERPTR 0 +#define DBG_NO_USERPTR 1 #define DBG_NO_UNSYNCHRONIZED_USERPTR 0 #define DBG_NO_LLC 0 #define DBG_NO_SEMAPHORES 0 @@ -72,7 +71,7 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); #define DBG_NO_SECURE_BATCHES 0 #define DBG_NO_PINNED_BATCHES 0 #define DBG_NO_FAST_RELOC 0 -#define DBG_NO_HANDLE_LUT 1 +#define DBG_NO_HANDLE_LUT 0 #define DBG_NO_WT 0 #define DBG_DUMP 0 @@ -105,10 +104,8 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); #define MAX_CPU_VMA_CACHE INT16_MAX #define MAP_PRESERVE_TIME 10 -#define MAKE_CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1)) -#define MAKE_USER_MAP(ptr) ((void*)((uintptr_t)(ptr) | 3)) -#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 2) -#define __MAP_TYPE(ptr) ((uintptr_t)(ptr) & 3) +#define MAKE_USER_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1)) +#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 1) #define MAKE_REQUEST(rq, ring) ((struct kgem_request *)((uintptr_t)(rq) | (ring))) @@ -158,7 +155,12 @@ struct kgem_buffer { uint32_t used; uint32_t need_io : 1; uint32_t write : 2; - uint32_t mmapped : 1; + uint32_t mmapped : 2; +}; +enum { + MMAPPED_NONE, + MMAPPED_GTT, + MMAPPED_CPU }; static struct kgem_bo *__kgem_freed_bo; @@ -252,9 +254,10 @@ static bool gem_set_caching(int fd, uint32_t handle, int caching) return drmIoctl(fd, LOCAL_IOCTL_I915_GEM_SET_CACHING, &arg) == 0; } - - - +static uint32_t gem_userptr(int fd, void *ptr, int size, int read_only) +{ + return 0; +} static bool __kgem_throttle_retire(struct kgem *kgem, unsigned flags) { @@ -289,24 +292,23 @@ static void *__kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) bo->handle, bytes(bo))); assert(bo->proxy == NULL); assert(!bo->snoop); - assert(kgem_bo_can_map(kgem, bo)); + assert(num_pages(bo) <= kgem->aperture_mappable / 4); retry_gtt: VG_CLEAR(mmap_arg); mmap_arg.handle = bo->handle; if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg)) { + int err = 0; (void)__kgem_throttle_retire(kgem, 0); if (kgem_expire_cache(kgem)) goto retry_gtt; - if (kgem->need_expire) { - kgem_cleanup_cache(kgem); + if (kgem_cleanup_cache(kgem)) goto retry_gtt; - } - printf("%s: failed to retrieve GTT offset for handle=%d\n", - __FUNCTION__, bo->handle); + ErrorF("%s: failed to retrieve GTT offset for handle=%d: %d\n", + __FUNCTION__, bo->handle, err); return NULL; } @@ -321,7 +323,7 @@ retry_mmap: return ptr; } -static int __gem_write(int fd, uint32_t handle, +static int gem_write(int fd, uint32_t handle, int offset, int length, const void *src) { @@ -338,7 +340,7 @@ static int __gem_write(int fd, uint32_t handle, return drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite); } -static int gem_write(int fd, uint32_t handle, +static int gem_write__cachealigned(int fd, uint32_t handle, int offset, int length, const void *src) { @@ -631,7 +633,7 @@ total_ram_size(void) static unsigned cpu_cache_size__cpuid4(void) { - /* Deterministic Cache Parmaeters (Function 04h)": + /* Deterministic Cache Parameters (Function 04h)": * When EAX is initialized to a value of 4, the CPUID instruction * returns deterministic cache information in the EAX, EBX, ECX * and EDX registers. This function requires ECX be initialized @@ -755,7 +757,7 @@ static bool is_hw_supported(struct kgem *kgem, * hw acceleration. 
*/ - if (kgem->gen == 060 && dev->revision < 8) { + if (kgem->gen == 060 && dev && dev->revision < 8) { /* pre-production SNB with dysfunctional BLT */ return false; } @@ -881,7 +883,7 @@ static bool test_has_pinned_batches(struct kgem *kgem) static bool kgem_init_pinned_batches(struct kgem *kgem) { - int count[2] = { 2, 2 }; + int count[2] = { 4, 4 }; int size[2] = { 1, 2 }; int n, i; @@ -911,6 +913,7 @@ static bool kgem_init_pinned_batches(struct kgem *kgem) pin.alignment = 0; if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin)) { gem_close(kgem->fd, pin.handle); + free(bo); goto err; } bo->presumed_offset = pin.offset; @@ -1028,7 +1031,6 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) kgem->has_no_reloc)); kgem->has_handle_lut = test_has_handle_lut(kgem); - kgem->has_handle_lut = 0; DBG(("%s: has handle-lut? %d\n", __FUNCTION__, kgem->has_handle_lut)); @@ -1042,6 +1044,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) DBG(("%s: can blt to cpu? %d\n", __FUNCTION__, kgem->can_blt_cpu)); + kgem->can_render_y = gen != 021 && (gen >> 3) != 4; + DBG(("%s: can render to Y-tiled surfaces? %d\n", __FUNCTION__, + kgem->can_render_y)); + kgem->has_secure_batches = test_has_secure_batches(kgem); DBG(("%s: can use privileged batchbuffers? %d\n", __FUNCTION__, kgem->has_secure_batches)); @@ -1115,6 +1121,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) kgem->aperture_low, kgem->aperture_low / (1024*1024), kgem->aperture_high, kgem->aperture_high / (1024*1024))); + kgem->aperture_mappable = 256 * 1024 * 1024; + if (dev != NULL) kgem->aperture_mappable = agp_aperture_size(dev, gen); if (kgem->aperture_mappable == 0 || kgem->aperture_mappable > aperture.aper_size) @@ -1149,6 +1157,14 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) if (kgem->max_gpu_size > totalram / 4) kgem->max_gpu_size = totalram / 4; + if (kgem->aperture_high > totalram / 2) { + kgem->aperture_high = totalram / 2; + kgem->aperture_low = kgem->aperture_high / 4; + DBG(("%s: reduced aperture watermaks to fit into ram; low=%d [%d], high=%d [%d]\n", __FUNCTION__, + kgem->aperture_low, kgem->aperture_low / (1024*1024), + kgem->aperture_high, kgem->aperture_high / (1024*1024))); + } + kgem->max_cpu_size = kgem->max_object_size; half_gpu_max = kgem->max_gpu_size / 2; @@ -1197,8 +1213,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen) kgem->max_upload_tile_size, kgem->max_copy_tile_size)); /* Convert the aperture thresholds to pages */ + kgem->aperture_mappable /= PAGE_SIZE; kgem->aperture_low /= PAGE_SIZE; kgem->aperture_high /= PAGE_SIZE; + kgem->aperture_total /= PAGE_SIZE; kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2; if ((int)kgem->fence_max < 0) @@ -1233,7 +1251,7 @@ inline static uint32_t kgem_pitch_alignment(struct kgem *kgem, unsigned flags) return kgem->min_alignment; } -void kgem_get_tile_size(struct kgem *kgem, int tiling, +void kgem_get_tile_size(struct kgem *kgem, int tiling, int pitch, int *tile_width, int *tile_height, int *tile_size) { if (kgem->gen <= 030) { @@ -1270,6 +1288,10 @@ void kgem_get_tile_size(struct kgem *kgem, int tiling, *tile_size = 4096; break; } + + /* Force offset alignment to tile-row */ + if (tiling && kgem->gen < 033) + *tile_width = pitch; } uint32_t kgem_surface_size(struct kgem *kgem, @@ -1400,10 +1422,15 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo) static void kgem_add_bo(struct kgem *kgem, 
struct kgem_bo *bo) { + assert(bo->refcnt); + assert(bo->proxy == NULL); + bo->exec = kgem_add_handle(kgem, bo); bo->rq = MAKE_REQUEST(kgem->next_request, kgem->ring); list_move_tail(&bo->request, &kgem->next_request->buffers); + if (bo->io && !list_is_empty(&bo->list)) + list_move(&bo->list, &kgem->batch_buffers); /* XXX is it worth working around gcc here? */ kgem->flush |= bo->flush; @@ -1456,31 +1483,11 @@ static void kgem_bo_binding_free(struct kgem *kgem, struct kgem_bo *bo) b = bo->binding.next; while (b) { struct kgem_bo_binding *next = b->next; - free (b); + free(b); b = next; } } -static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo) -{ - int type = IS_CPU_MAP(bo->map); - - assert(!IS_USER_MAP(bo->map)); - - DBG(("%s: releasing %s vma for handle=%d, count=%d\n", - __FUNCTION__, type ? "CPU" : "GTT", - bo->handle, kgem->vma[type].count)); - - VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); - user_free(MAP(bo->map)); - bo->map = NULL; - - if (!list_is_empty(&bo->vma)) { - list_del(&bo->vma); - kgem->vma[type].count--; - } -} - static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo) { DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); @@ -1496,21 +1503,31 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo) kgem_bo_binding_free(kgem, bo); - if (IS_USER_MAP(bo->map)) { + if (IS_USER_MAP(bo->map__cpu)) { assert(bo->rq == NULL); assert(!__kgem_busy(kgem, bo->handle)); - assert(MAP(bo->map) != bo || bo->io || bo->flush); + assert(MAP(bo->map__cpu) != bo || bo->io || bo->flush); if (!(bo->io || bo->flush)) { DBG(("%s: freeing snooped base\n", __FUNCTION__)); - assert(bo != MAP(bo->map)); - free(MAP(bo->map)); + assert(bo != MAP(bo->map__cpu)); + free(MAP(bo->map__cpu)); } - bo->map = NULL; + bo->map__cpu = NULL; } - if (bo->map) - kgem_bo_release_map(kgem, bo); - assert(list_is_empty(&bo->vma)); - assert(bo->map == NULL); + + DBG(("%s: releasing %p:%p vma for handle=%d, count=%d\n", + __FUNCTION__, bo->map__gtt, bo->map__cpu, + bo->handle, list_is_empty(&bo->vma) ? 
0 : kgem->vma[bo->map__gtt == NULL].count)); + + if (!list_is_empty(&bo->vma)) { + _list_del(&bo->vma); + kgem->vma[bo->map__gtt == NULL].count--; + } + +// if (bo->map__gtt) +// munmap(MAP(bo->map__gtt), bytes(bo)); +// if (bo->map__cpu) +// munmap(MAP(bo->map__cpu), bytes(bo)); _list_del(&bo->list); _list_del(&bo->request); @@ -1546,22 +1563,28 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem, kgem->need_expire = true; if (bucket(bo) >= NUM_CACHE_BUCKETS) { - list_move(&bo->list, &kgem->large_inactive); - return; + if (bo->map__gtt) { +// munmap(MAP(bo->map__gtt), bytes(bo)); + bo->map__gtt = NULL; } + list_move(&bo->list, &kgem->large_inactive); + } else { assert(bo->flush == false); list_move(&bo->list, &kgem->inactive[bucket(bo)]); - if (bo->map) { - int type = IS_CPU_MAP(bo->map); - if (bucket(bo) >= NUM_CACHE_BUCKETS || - (!type && !__kgem_bo_is_mappable(kgem, bo))) { -// munmap(MAP(bo->map), bytes(bo)); - bo->map = NULL; + if (bo->map__gtt) { + if (!kgem_bo_can_map(kgem, bo)) { +// munmap(MAP(bo->map__gtt), bytes(bo)); + bo->map__gtt = NULL; + } + if (bo->map__gtt) { + list_add(&bo->vma, &kgem->vma[0].inactive[bucket(bo)]); + kgem->vma[0].count++; + } } - if (bo->map) { - list_add(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]); - kgem->vma[type].count++; + if (bo->map__cpu && !bo->map__gtt) { + list_add(&bo->vma, &kgem->vma[1].inactive[bucket(bo)]); + kgem->vma[1].count++; } } } @@ -1574,6 +1597,10 @@ static struct kgem_bo *kgem_bo_replace_io(struct kgem_bo *bo) return bo; assert(!bo->snoop); + if (__kgem_freed_bo) { + base = __kgem_freed_bo; + __kgem_freed_bo = *(struct kgem_bo **)base; + } else base = malloc(sizeof(*base)); if (base) { DBG(("%s: transferring io handle=%d to bo\n", @@ -1600,10 +1627,10 @@ inline static void kgem_bo_remove_from_inactive(struct kgem *kgem, list_del(&bo->list); assert(bo->rq == NULL); assert(bo->exec == NULL); - if (bo->map) { - assert(!list_is_empty(&bo->vma)); + if (!list_is_empty(&bo->vma)) { + assert(bo->map__gtt || bo->map__cpu); list_del(&bo->vma); - kgem->vma[IS_CPU_MAP(bo->map)].count--; + kgem->vma[bo->map__gtt == NULL].count--; } } @@ -1614,8 +1641,10 @@ inline static void kgem_bo_remove_from_active(struct kgem *kgem, list_del(&bo->list); assert(bo->rq != NULL); - if (bo->rq == (void *)kgem) + if (RQ(bo->rq) == (void *)kgem) { + assert(bo->exec == NULL); list_del(&bo->request); + } assert(list_is_empty(&bo->vma)); } @@ -1740,6 +1769,7 @@ void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo) if (kgem->nexec != 1 || bo->exec == NULL) return; + assert(bo); DBG(("%s: only handle in batch, discarding last operations for handle=%d\n", __FUNCTION__, bo->handle)); @@ -1750,6 +1780,10 @@ void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo) bo->refcnt++; kgem_reset(kgem); bo->refcnt--; + + assert(kgem->nreloc == 0); + assert(kgem->nexec == 0); + assert(bo->exec == NULL); } static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) @@ -1777,7 +1811,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) kgem_bo_move_to_snoop(kgem, bo); return; } - if (!IS_USER_MAP(bo->map)) + if (!IS_USER_MAP(bo->map__cpu)) bo->flush = false; if (bo->scanout) { @@ -1793,9 +1827,6 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) goto destroy; } - if (!kgem->has_llc && IS_CPU_MAP(bo->map) && bo->domain != DOMAIN_CPU) - kgem_bo_release_map(kgem, bo); - assert(list_is_empty(&bo->vma)); assert(list_is_empty(&bo->list)); assert(bo->flush == false); @@ -1824,7 +1855,7 @@ static void 
__kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) assert(bo->exec == NULL); assert(list_is_empty(&bo->request)); - if (!IS_CPU_MAP(bo->map)) { + if (bo->map__cpu == NULL || bucket(bo) >= NUM_CACHE_BUCKETS) { if (!kgem_bo_set_purgeable(kgem, bo)) goto destroy; @@ -1852,16 +1883,18 @@ static void kgem_bo_unref(struct kgem *kgem, struct kgem_bo *bo) static void kgem_buffer_release(struct kgem *kgem, struct kgem_buffer *bo) { + assert(bo->base.io); while (!list_is_empty(&bo->base.vma)) { struct kgem_bo *cached; cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma); assert(cached->proxy == &bo->base); + assert(cached != &bo->base); list_del(&cached->vma); - assert(*(struct kgem_bo **)cached->map == cached); - *(struct kgem_bo **)cached->map = NULL; - cached->map = NULL; + assert(*(struct kgem_bo **)cached->map__gtt == cached); + *(struct kgem_bo **)cached->map__gtt = NULL; + cached->map__gtt = NULL; kgem_bo_destroy(kgem, cached); } @@ -1877,6 +1910,10 @@ static bool kgem_retire__buffers(struct kgem *kgem) struct kgem_buffer, base.list); + DBG(("%s: handle=%d, busy? %d [%d]\n", + __FUNCTION__, bo->base.handle, bo->base.rq != NULL, bo->base.exec != NULL)); + + assert(bo->base.exec == NULL || RQ(bo->base.rq) == kgem->next_request); if (bo->base.rq) break; @@ -1897,7 +1934,7 @@ static bool kgem_retire__flushing(struct kgem *kgem) bool retired = false; list_for_each_entry_safe(bo, next, &kgem->flushing, request) { - assert(bo->rq == (void *)kgem); + assert(RQ(bo->rq) == (void *)kgem); assert(bo->exec == NULL); if (__kgem_busy(kgem, bo->handle)) @@ -1960,7 +1997,8 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq) DBG(("%s: moving %d to flushing\n", __FUNCTION__, bo->handle)); list_add(&bo->request, &kgem->flushing); - bo->rq = (void *)kgem; + bo->rq = MAKE_REQUEST(kgem, RQ_RING(bo->rq)); + kgem->need_retire = true; continue; } @@ -1985,6 +2023,7 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq) } assert(rq->bo->rq == NULL); + assert(rq->bo->exec == NULL); assert(list_is_empty(&rq->bo->request)); if (--rq->bo->refcnt == 0) { @@ -2057,7 +2096,7 @@ bool kgem_retire(struct kgem *kgem) { bool retired = false; - DBG(("%s\n", __FUNCTION__)); + DBG(("%s, need_retire?=%d\n", __FUNCTION__, kgem->need_retire)); kgem->need_retire = false; @@ -2077,6 +2116,7 @@ bool __kgem_ring_is_idle(struct kgem *kgem, int ring) { struct kgem_request *rq; + assert(ring < ARRAY_SIZE(kgem->requests)); assert(!list_is_empty(&kgem->requests[ring])); rq = list_last_entry(&kgem->requests[ring], @@ -2091,10 +2131,24 @@ bool __kgem_ring_is_idle(struct kgem *kgem, int ring) __FUNCTION__, ring, rq->bo->handle)); kgem_retire__requests_ring(kgem, ring); + kgem_retire__buffers(kgem); + assert(list_is_empty(&kgem->requests[ring])); return true; } +#ifndef NDEBUG +static void kgem_commit__check_buffers(struct kgem *kgem) +{ + struct kgem_buffer *bo; + + list_for_each_entry(bo, &kgem->active_buffers, base.list) + assert(bo->base.exec == NULL); +} +#else +#define kgem_commit__check_buffers(kgem) +#endif + static void kgem_commit(struct kgem *kgem) { struct kgem_request *rq = kgem->next_request; @@ -2118,6 +2172,7 @@ static void kgem_commit(struct kgem *kgem) if (!bo->refcnt && !bo->reusable) { assert(!bo->snoop); + assert(!bo->proxy); kgem_bo_free(kgem, bo); continue; } @@ -2128,7 +2183,6 @@ static void kgem_commit(struct kgem *kgem) if (bo->proxy) { /* proxies are not used for domain tracking */ - bo->exec = NULL; __kgem_bo_clear_busy(bo); } @@ -2152,7 +2206,8 @@ static void 
kgem_commit(struct kgem *kgem) kgem_retire(kgem); assert(list_is_empty(&rq->buffers)); - assert(rq->bo->map == NULL); + assert(rq->bo->map__gtt == NULL); + assert(rq->bo->map__cpu == NULL); gem_close(kgem->fd, rq->bo->handle); kgem_cleanup_cache(kgem); } else { @@ -2161,6 +2216,8 @@ static void kgem_commit(struct kgem *kgem) } kgem->next_request = NULL; + + kgem_commit__check_buffers(kgem); } static void kgem_close_list(struct kgem *kgem, struct list *head) @@ -2182,17 +2239,18 @@ static void kgem_finish_buffers(struct kgem *kgem) struct kgem_buffer *bo, *next; list_for_each_entry_safe(bo, next, &kgem->batch_buffers, base.list) { - DBG(("%s: buffer handle=%d, used=%d, exec?=%d, write=%d, mmapped=%s\n", + DBG(("%s: buffer handle=%d, used=%d, exec?=%d, write=%d, mmapped=%s, refcnt=%d\n", __FUNCTION__, bo->base.handle, bo->used, bo->base.exec!=NULL, - bo->write, bo->mmapped ? IS_CPU_MAP(bo->base.map) ? "cpu" : "gtt" : "no")); + bo->write, bo->mmapped == MMAPPED_CPU ? "cpu" : bo->mmapped == MMAPPED_GTT ? "gtt" : "no", + bo->base.refcnt)); assert(next->base.list.prev == &bo->base.list); assert(bo->base.io); assert(bo->base.refcnt >= 1); - if (!bo->base.exec) { - DBG(("%s: skipping unattached handle=%d, used=%d\n", - __FUNCTION__, bo->base.handle, bo->used)); + if (bo->base.refcnt > 1 && !bo->base.exec) { + DBG(("%s: skipping unattached handle=%d, used=%d, refcnt=%d\n", + __FUNCTION__, bo->base.handle, bo->used, bo->base.refcnt)); continue; } @@ -2202,27 +2260,28 @@ static void kgem_finish_buffers(struct kgem *kgem) } if (bo->mmapped) { - int used; + uint32_t used; assert(!bo->need_io); used = ALIGN(bo->used, PAGE_SIZE); if (!DBG_NO_UPLOAD_ACTIVE && used + PAGE_SIZE <= bytes(&bo->base) && - (kgem->has_llc || !IS_CPU_MAP(bo->base.map) || bo->base.snoop)) { - DBG(("%s: retaining upload buffer (%d/%d)\n", - __FUNCTION__, bo->used, bytes(&bo->base))); + (kgem->has_llc || bo->mmapped == MMAPPED_GTT || bo->base.snoop)) { + DBG(("%s: retaining upload buffer (%d/%d): used=%d, refcnt=%d\n", + __FUNCTION__, bo->used, bytes(&bo->base), used, bo->base.refcnt)); bo->used = used; list_move(&bo->base.list, &kgem->active_buffers); + kgem->need_retire = true; continue; } DBG(("%s: discarding mmapped buffer, used=%d, map type=%d\n", - __FUNCTION__, bo->used, (int)__MAP_TYPE(bo->base.map))); + __FUNCTION__, bo->used, bo->mmapped)); goto decouple; } - if (!bo->used) { + if (!bo->used || !bo->base.exec) { /* Unless we replace the handle in the execbuffer, * then this bo will become active. So decouple it * from the buffer list and track it in the normal @@ -2301,7 +2360,7 @@ static void kgem_finish_buffers(struct kgem *kgem) bo->base.handle, shrink->handle)); assert(bo->used <= bytes(shrink)); - if (gem_write(kgem->fd, shrink->handle, + if (gem_write__cachealigned(kgem->fd, shrink->handle, 0, bo->used, bo->mem) == 0) { shrink->target_handle = kgem->has_handle_lut ? 
bo->base.target_handle : shrink->handle; @@ -2340,7 +2399,7 @@ static void kgem_finish_buffers(struct kgem *kgem) __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base))); ASSERT_IDLE(kgem, bo->base.handle); assert(bo->used <= bytes(&bo->base)); - gem_write(kgem->fd, bo->base.handle, + gem_write__cachealigned(kgem->fd, bo->base.handle, 0, bo->used, bo->mem); bo->need_io = 0; @@ -2390,33 +2449,58 @@ static int kgem_batch_write(struct kgem *kgem, uint32_t handle, uint32_t size) ASSERT_IDLE(kgem, handle); +retry: /* If there is no surface data, just upload the batch */ - if (kgem->surface == kgem->batch_size) - return gem_write(kgem->fd, handle, + if (kgem->surface == kgem->batch_size) { + if (gem_write__cachealigned(kgem->fd, handle, 0, sizeof(uint32_t)*kgem->nbatch, - kgem->batch); + kgem->batch) == 0) + return 0; + + goto expire; + } /* Are the batch pages conjoint with the surface pages? */ if (kgem->surface < kgem->nbatch + PAGE_SIZE/sizeof(uint32_t)) { assert(size == PAGE_ALIGN(kgem->batch_size*sizeof(uint32_t))); - return gem_write(kgem->fd, handle, + if (gem_write__cachealigned(kgem->fd, handle, 0, kgem->batch_size*sizeof(uint32_t), - kgem->batch); + kgem->batch) == 0) + return 0; + + goto expire; } /* Disjoint surface/batch, upload separately */ - ret = gem_write(kgem->fd, handle, + if (gem_write__cachealigned(kgem->fd, handle, 0, sizeof(uint32_t)*kgem->nbatch, - kgem->batch); - if (ret) - return ret; + kgem->batch)) + goto expire; ret = PAGE_ALIGN(sizeof(uint32_t) * kgem->batch_size); ret -= sizeof(uint32_t) * kgem->surface; assert(size-ret >= kgem->nbatch*sizeof(uint32_t)); - return __gem_write(kgem->fd, handle, + if (gem_write(kgem->fd, handle, size - ret, (kgem->batch_size - kgem->surface)*sizeof(uint32_t), - kgem->batch + kgem->surface); + kgem->batch + kgem->surface)) + goto expire; + + return 0; + +expire: + ret = errno; + assert(ret != EINVAL); + + (void)__kgem_throttle_retire(kgem, 0); + if (kgem_expire_cache(kgem)) + goto retry; + + if (kgem_cleanup_cache(kgem)) + goto retry; + + ErrorF("%s: failed to write batch (handle=%d): %d\n", + __FUNCTION__, handle, ret); + return ret; } void kgem_reset(struct kgem *kgem) @@ -2442,6 +2526,7 @@ void kgem_reset(struct kgem *kgem) assert(bo->domain == DOMAIN_GPU || bo->domain == DOMAIN_NONE); list_add(&bo->request, &kgem->flushing); bo->rq = (void *)kgem; + kgem->need_retire = true; } else __kgem_bo_clear_busy(bo); @@ -2474,6 +2559,7 @@ void kgem_reset(struct kgem *kgem) kgem->nreloc__self = 0; kgem->aperture = 0; kgem->aperture_fenced = 0; + kgem->aperture_max_fence = 0; kgem->nbatch = 0; kgem->surface = kgem->batch_size; kgem->mode = KGEM_NONE; @@ -2599,10 +2685,10 @@ void _kgem_submit(struct kgem *kgem) batch_end = kgem_end_batch(kgem); kgem_sna_flush(kgem); - DBG(("batch[%d/%d, flags=%x]: %d %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d\n", + DBG(("batch[%d/%d, flags=%x]: %d %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d [fenced=%d]\n", kgem->mode, kgem->ring, kgem->batch_flags, batch_end, kgem->nbatch, kgem->surface, kgem->batch_size, - kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture)); + kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced)); assert(kgem->nbatch <= kgem->batch_size); assert(kgem->nbatch <= kgem->surface); @@ -2660,8 +2746,8 @@ void _kgem_submit(struct kgem *kgem) { int fd = open("/tmp1/1/batchbuffer.bin", O_CREAT|O_WRONLY|O_BINARY); if (fd != -1) { - write(fd, kgem->batch, size); - close(fd); + ret = write(fd, kgem->batch, batch_end*sizeof(uint32_t)); + fd = 
close(fd); } else printf("SNA: failed to write batchbuffer\n"); asm volatile("int3"); @@ -2694,9 +2780,9 @@ void _kgem_submit(struct kgem *kgem) #if 0 ret = errno; - ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d: errno=%d\n", + ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d, fenced=%d, high=%d,%d: errno=%d\n", kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface, - kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, errno); + kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced, kgem->aperture_high, kgem->aperture_total, errno); for (i = 0; i < kgem->nexec; i++) { struct kgem_bo *bo, *found = NULL; @@ -2764,7 +2850,7 @@ void kgem_throttle(struct kgem *kgem) } } -void kgem_purge_cache(struct kgem *kgem) +static void kgem_purge_cache(struct kgem *kgem) { struct kgem_bo *bo, *next; int i; @@ -2892,7 +2978,7 @@ bool kgem_expire_cache(struct kgem *kgem) break; } - if (bo->map && bo->delta + MAP_PRESERVE_TIME > expire) { + if (bo->map__cpu && bo->delta + MAP_PRESERVE_TIME > expire) { idle = false; list_move_tail(&bo->list, &preserve); } else { @@ -2932,7 +3018,7 @@ bool kgem_expire_cache(struct kgem *kgem) (void)size; } -void kgem_cleanup_cache(struct kgem *kgem) +bool kgem_cleanup_cache(struct kgem *kgem) { unsigned int i; int n; @@ -2962,6 +3048,9 @@ void kgem_cleanup_cache(struct kgem *kgem) kgem_retire(kgem); kgem_cleanup(kgem); + if (!kgem->need_expire) + return false; + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { while (!list_is_empty(&kgem->inactive[i])) kgem_bo_free(kgem, @@ -2984,6 +3073,7 @@ void kgem_cleanup_cache(struct kgem *kgem) kgem->need_purge = false; kgem->need_expire = false; + return true; } static struct kgem_bo * @@ -3028,8 +3118,10 @@ retry_large: goto discard; list_del(&bo->list); - if (bo->rq == (void *)kgem) + if (RQ(bo->rq) == (void *)kgem) { + assert(bo->exec == NULL); list_del(&bo->request); + } bo->delta = 0; assert_tiling(kgem, bo); @@ -3083,7 +3175,7 @@ discard: __FUNCTION__, for_cpu ? "cpu" : "gtt")); cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)]; list_for_each_entry(bo, cache, vma) { - assert(IS_CPU_MAP(bo->map) == for_cpu); + assert(for_cpu ? bo->map__cpu : bo->map__gtt); assert(bucket(bo) == cache_bucket(num_pages)); assert(bo->proxy == NULL); assert(bo->rq == NULL); @@ -3107,6 +3199,8 @@ discard: continue; kgem_bo_remove_from_inactive(kgem, bo); + assert(list_is_empty(&bo->vma)); + assert(list_is_empty(&bo->list)); bo->tiling = I915_TILING_NONE; bo->pitch = 0; @@ -3163,10 +3257,10 @@ discard: bo->pitch = 0; } - if (bo->map) { + if (bo->map__gtt || bo->map__cpu) { if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { int for_cpu = !!(flags & CREATE_CPU_MAP); - if (IS_CPU_MAP(bo->map) != for_cpu) { + if (for_cpu ? bo->map__cpu : bo->map__gtt){ if (first != NULL) break; @@ -3181,6 +3275,9 @@ discard: continue; } } else { + if (flags & CREATE_GTT_MAP && !kgem_bo_can_map(kgem, bo)) + continue; + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { if (first != NULL) break; @@ -3202,6 +3299,7 @@ discard: __FUNCTION__, bo->handle, num_pages(bo), use_active ? "active" : "inactive")); assert(list_is_empty(&bo->list)); + assert(list_is_empty(&bo->vma)); assert(use_active || bo->domain != DOMAIN_GPU); assert(!bo->needs_flush || use_active); assert_tiling(kgem, bo); @@ -3223,6 +3321,7 @@ discard: __FUNCTION__, first->handle, num_pages(first), use_active ? 
"active" : "inactive")); assert(list_is_empty(&first->list)); + assert(list_is_empty(&first->vma)); assert(use_active || first->domain != DOMAIN_GPU); assert(!first->needs_flush || use_active); ASSERT_MAYBE_IDLE(kgem, first->handle, !use_active); @@ -3282,11 +3381,11 @@ inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo) assert(kgem->gen < 040); if (kgem->gen < 030) - size = 512 * 1024; + size = 512 * 1024 / PAGE_SIZE; else - size = 1024 * 1024; - while (size < bytes(bo)) - size *= 2; + size = 1024 * 1024 / PAGE_SIZE; + while (size < num_pages(bo)) + size <<= 1; return size; } @@ -3308,7 +3407,6 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, if (tiling < 0) exact = true, tiling = -tiling; - DBG(("%s(%dx%d, bpp=%d, tiling=%d, exact=%d, inactive=%d, cpu-mapping=%d, gtt-mapping=%d, scanout?=%d, prime?=%d, temp?=%d)\n", __FUNCTION__, width, height, bpp, tiling, exact, !!(flags & CREATE_INACTIVE), @@ -3324,61 +3422,6 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, size /= PAGE_SIZE; bucket = cache_bucket(size); - if (flags & CREATE_SCANOUT) { - struct kgem_bo *last = NULL; - - list_for_each_entry_reverse(bo, &kgem->scanout, list) { - assert(bo->scanout); - assert(bo->delta); - assert(!bo->flush); - assert_tiling(kgem, bo); - - if (size > num_pages(bo) || num_pages(bo) > 2*size) - continue; - - if (bo->tiling != tiling || - (tiling != I915_TILING_NONE && bo->pitch != pitch)) { - if (!gem_set_tiling(kgem->fd, bo->handle, - tiling, pitch)) - continue; - - bo->tiling = tiling; - bo->pitch = pitch; - } - - if (flags & CREATE_INACTIVE && bo->rq) { - last = bo; - continue; - } - - list_del(&bo->list); - - bo->unique_id = kgem_get_unique_id(kgem); - DBG((" 1:from scanout: pitch=%d, tiling=%d, handle=%d, id=%d\n", - bo->pitch, bo->tiling, bo->handle, bo->unique_id)); - assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); - assert_tiling(kgem, bo); - bo->refcnt = 1; - return bo; - } - - if (last) { - list_del(&last->list); - - last->unique_id = kgem_get_unique_id(kgem); - DBG((" 1:from scanout: pitch=%d, tiling=%d, handle=%d, id=%d\n", - last->pitch, last->tiling, last->handle, last->unique_id)); - assert(last->pitch*kgem_aligned_height(kgem, height, last->tiling) <= kgem_bo_size(last)); - assert_tiling(kgem, last); - last->refcnt = 1; - return last; - } - - bo = NULL; //__kgem_bo_create_as_display(kgem, size, tiling, pitch); - if (bo) - return bo; - } - if (bucket >= NUM_CACHE_BUCKETS) { DBG(("%s: large bo num pages=%d, bucket=%d\n", __FUNCTION__, size, bucket)); @@ -3428,7 +3471,6 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); assert_tiling(kgem, bo); bo->refcnt = 1; - bo->flush = true; return bo; } @@ -3488,9 +3530,9 @@ large_inactive: assert(bucket(bo) == bucket); assert(bo->refcnt == 0); assert(!bo->scanout); - assert(bo->map); - assert(IS_CPU_MAP(bo->map) == for_cpu); + assert(for_cpu ? 
bo->map__cpu : bo->map__gtt); assert(bo->rq == NULL); + assert(bo->exec == NULL); assert(list_is_empty(&bo->request)); assert(bo->flush == false); assert_tiling(kgem, bo); @@ -3520,6 +3562,8 @@ large_inactive: bo->domain = DOMAIN_NONE; kgem_bo_remove_from_inactive(kgem, bo); + assert(list_is_empty(&bo->list)); + assert(list_is_empty(&bo->vma)); DBG((" from inactive vma: pitch=%d, tiling=%d: handle=%d, id=%d\n", bo->pitch, bo->tiling, bo->handle, bo->unique_id)); @@ -3740,9 +3784,6 @@ search_inactive: if (!gem_set_tiling(kgem->fd, bo->handle, tiling, pitch)) continue; - - if (bo->map) - kgem_bo_release_map(kgem, bo); } if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { @@ -3751,6 +3792,8 @@ search_inactive: } kgem_bo_remove_from_inactive(kgem, bo); + assert(list_is_empty(&bo->list)); + assert(list_is_empty(&bo->vma)); bo->pitch = pitch; bo->tiling = tiling; @@ -3799,12 +3842,6 @@ create: return NULL; } - if (bucket >= NUM_CACHE_BUCKETS) { - DBG(("%s: marking large bo for automatic flushing\n", - __FUNCTION__)); - bo->flush = true; - } - bo->unique_id = kgem_get_unique_id(kgem); if (tiling == I915_TILING_NONE || gem_set_tiling(kgem->fd, handle, tiling, pitch)) { @@ -3935,16 +3972,21 @@ void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) __FUNCTION__, bo->handle, bo->proxy != NULL)); if (bo->proxy) { + assert(!bo->reusable); + kgem_bo_binding_free(kgem, bo); + + assert(list_is_empty(&bo->list)); _list_del(&bo->vma); _list_del(&bo->request); - if (bo->io && bo->exec == NULL) - _kgem_bo_delete_buffer(kgem, bo); - kgem_bo_unref(kgem, bo->proxy); - kgem_bo_binding_free(kgem, bo); - free(bo); - return; - } + if (bo->io && bo->domain == DOMAIN_CPU) + _kgem_bo_delete_buffer(kgem, bo); + + kgem_bo_unref(kgem, bo->proxy); + + *(struct kgem_bo **)bo = __kgem_freed_bo; + __kgem_freed_bo = bo; + } else __kgem_bo_destroy(kgem, bo); } @@ -3989,6 +4031,58 @@ inline static bool needs_semaphore(struct kgem *kgem, struct kgem_bo *bo) return kgem->nreloc && bo->rq && RQ_RING(bo->rq) != kgem->ring; } +static bool aperture_check(struct kgem *kgem, unsigned num_pages) +{ + if (kgem->aperture) { + struct drm_i915_gem_get_aperture aperture; + + VG_CLEAR(aperture); + aperture.aper_available_size = kgem->aperture_high; + aperture.aper_available_size *= PAGE_SIZE; + (void)drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture); + + DBG(("%s: aperture required %ld bytes, available %ld bytes\n", + __FUNCTION__, + (long)num_pages * PAGE_SIZE, + (long)aperture.aper_available_size)); + + /* Leave some space in case of alignment issues */ + aperture.aper_available_size -= 1024 * 1024; + aperture.aper_available_size -= kgem->aperture_mappable * PAGE_SIZE / 2; + if (kgem->gen < 033) + aperture.aper_available_size -= kgem->aperture_max_fence * PAGE_SIZE; + if (!kgem->has_llc) + aperture.aper_available_size -= 2 * kgem->nexec * PAGE_SIZE; + + DBG(("%s: num_pages=%d, estimated max usable=%ld\n", + __FUNCTION__, num_pages, (long)(aperture.aper_available_size/PAGE_SIZE))); + + if (num_pages <= aperture.aper_available_size / PAGE_SIZE) + return true; + } + + return false; +} + +static inline bool kgem_flush(struct kgem *kgem, bool flush) +{ + if (unlikely(kgem->wedged)) + return false; + + if (kgem->nreloc == 0) + return true; + + if (container_of(kgem, struct sna, kgem)->flags & SNA_POWERSAVE) + return true; + + if (kgem->flush == flush && kgem->aperture < kgem->aperture_low) + return true; + + DBG(("%s: opportunistic flushing? 
flush=%d,%d, aperture=%d/%d, idle?=%d\n", + __FUNCTION__, kgem->flush, flush, kgem->aperture, kgem->aperture_low, kgem_ring_is_idle(kgem, kgem->ring))); + return !kgem_ring_is_idle(kgem, kgem->ring); +} + bool kgem_check_bo(struct kgem *kgem, ...) { va_list ap; @@ -3996,6 +4090,7 @@ bool kgem_check_bo(struct kgem *kgem, ...) int num_exec = 0; int num_pages = 0; bool flush = false; + bool busy = true; va_start(ap, kgem); while ((bo = va_arg(ap, struct kgem_bo *))) { @@ -4004,13 +4099,16 @@ bool kgem_check_bo(struct kgem *kgem, ...) if (bo->exec) continue; - if (needs_semaphore(kgem, bo)) + if (needs_semaphore(kgem, bo)) { + DBG(("%s: flushing for required semaphore\n", __FUNCTION__)); return false; + } num_pages += num_pages(bo); num_exec++; flush |= bo->flush; + busy &= bo->rq != NULL; } va_end(ap); @@ -4020,43 +4118,129 @@ bool kgem_check_bo(struct kgem *kgem, ...) if (!num_pages) return true; - if (kgem_flush(kgem, flush)) - return false; - - if (kgem->aperture > kgem->aperture_low && - kgem_ring_is_idle(kgem, kgem->ring)) { - DBG(("%s: current aperture usage (%d) is greater than low water mark (%d)\n", - __FUNCTION__, kgem->aperture, kgem->aperture_low)); - return false; - } - - if (num_pages + kgem->aperture > kgem->aperture_high) { - DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", - __FUNCTION__, num_pages + kgem->aperture, kgem->aperture_high)); - return false; - } - if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) { DBG(("%s: out of exec slots (%d + %d / %d)\n", __FUNCTION__, kgem->nexec, num_exec, KGEM_EXEC_SIZE(kgem))); return false; } - return true; + if (num_pages + kgem->aperture > kgem->aperture_high) { + DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", + __FUNCTION__, num_pages + kgem->aperture, kgem->aperture_high)); + if (!aperture_check(kgem, num_pages + kgem->aperture)) + return false; + } + + if (busy) + return true; + + return kgem_flush(kgem, flush); } +#if 0 +bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->refcnt); + while (bo->proxy) + bo = bo->proxy; + assert(bo->refcnt); + if (bo->exec) { + if (kgem->gen < 040 && + bo->tiling != I915_TILING_NONE && + (bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { + uint32_t size; + assert(bo->tiling == I915_TILING_X); + if (kgem->nfence >= kgem->fence_max) + return false; + if (kgem->aperture_fenced) { + size = 3*kgem->aperture_fenced; + if (kgem->aperture_total == kgem->aperture_mappable) + size += kgem->aperture; + if (size > kgem->aperture_mappable && + kgem_ring_is_idle(kgem, kgem->ring)) { + DBG(("%s: opportunistic fence flush\n", __FUNCTION__)); + return false; + } + } + size = kgem_bo_fenced_size(kgem, bo); + if (size > kgem->aperture_max_fence) + kgem->aperture_max_fence = size; + size += kgem->aperture_fenced; + if (kgem->gen < 033) + size += kgem->aperture_max_fence; + if (kgem->aperture_total == kgem->aperture_mappable) + size += kgem->aperture; + if (size > kgem->aperture_mappable) { + DBG(("%s: estimated fence space required [%d] exceed aperture [%d]\n", + __FUNCTION__, size, kgem->aperture_mappable)); + return false; + } + } + return true; + } + if (kgem->nexec >= KGEM_EXEC_SIZE(kgem) - 1) + return false; + if (needs_semaphore(kgem, bo)) { + DBG(("%s: flushing for required semaphore\n", __FUNCTION__)); + return false; + } + assert_tiling(kgem, bo); + if (kgem->gen < 040 && bo->tiling != I915_TILING_NONE) { + uint32_t size; + assert(bo->tiling == I915_TILING_X); + if (kgem->nfence >= kgem->fence_max) + return false; + if 
(kgem->aperture_fenced) { + size = 3*kgem->aperture_fenced; + if (kgem->aperture_total == kgem->aperture_mappable) + size += kgem->aperture; + if (size > kgem->aperture_mappable && + kgem_ring_is_idle(kgem, kgem->ring)) { + DBG(("%s: opportunistic fence flush\n", __FUNCTION__)); + return false; + } + } + + size = kgem_bo_fenced_size(kgem, bo); + if (size > kgem->aperture_max_fence) + kgem->aperture_max_fence = size; + size += kgem->aperture_fenced; + if (kgem->gen < 033) + size += kgem->aperture_max_fence; + if (kgem->aperture_total == kgem->aperture_mappable) + size += kgem->aperture; + if (size > kgem->aperture_mappable) { + DBG(("%s: estimated fence space required [%d] exceed aperture [%d]\n", + __FUNCTION__, size, kgem->aperture_mappable)); + return false; + } + } + + if (kgem->aperture + kgem->aperture_fenced + num_pages(bo) > kgem->aperture_high) { + DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", + __FUNCTION__, num_pages(bo) + kgem->aperture, kgem->aperture_high)); + if (!aperture_check(kgem, num_pages(bo) + kgem->aperture + kgem->aperture_fenced)) + return false; + } + + if (bo->rq) + return true; + + return kgem_flush(kgem, bo->flush); +} +#endif @@ -4085,23 +4269,14 @@ uint32_t kgem_add_reloc(struct kgem *kgem, DBG(("%s: handle=%d, pos=%d, delta=%d, domains=%08x\n", __FUNCTION__, bo ? bo->handle : 0, pos, delta, read_write_domain)); + assert(kgem->gen < 0100); assert((read_write_domain & 0x7fff) == 0 || bo != NULL); - if( bo != NULL && bo->handle == -2) - { - if (bo->exec == NULL) - kgem_add_bo(kgem, bo); - - if (read_write_domain & 0x7fff && !bo->gpu_dirty) { - __kgem_bo_mark_dirty(bo); - } - return 0; - }; - index = kgem->nreloc++; assert(index < ARRAY_SIZE(kgem->reloc)); kgem->reloc[index].offset = pos * sizeof(kgem->batch[0]); if (bo) { + assert(kgem->mode != KGEM_NONE); assert(bo->refcnt); while (bo->proxy) { DBG(("%s: adding proxy [delta=%d] for handle=%d\n", @@ -4115,6 +4290,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem, bo->rq = MAKE_REQUEST(kgem->next_request, kgem->ring); bo->exec = &_kgem_dummy_exec; + bo->domain = DOMAIN_GPU; } if (read_write_domain & 0x7fff && !bo->gpu_dirty) @@ -4133,6 +4309,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem, if (kgem->gen < 040 && read_write_domain & KGEM_RELOC_FENCED) { if (bo->tiling && (bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { + assert(bo->tiling == I915_TILING_X); assert(kgem->nfence < kgem->fence_max); kgem->aperture_fenced += kgem_bo_fenced_size(kgem, bo); @@ -4164,6 +4341,77 @@ uint32_t kgem_add_reloc(struct kgem *kgem, return delta; } +uint64_t kgem_add_reloc64(struct kgem *kgem, + uint32_t pos, + struct kgem_bo *bo, + uint32_t read_write_domain, + uint64_t delta) +{ + int index; + + DBG(("%s: handle=%d, pos=%d, delta=%ld, domains=%08x\n", + __FUNCTION__, bo ? 
bo->handle : 0, pos, (long)delta, read_write_domain)); + + assert(kgem->gen >= 0100); + assert((read_write_domain & 0x7fff) == 0 || bo != NULL); + + index = kgem->nreloc++; + assert(index < ARRAY_SIZE(kgem->reloc)); + kgem->reloc[index].offset = pos * sizeof(kgem->batch[0]); + if (bo) { + assert(kgem->mode != KGEM_NONE); + assert(bo->refcnt); + while (bo->proxy) { + DBG(("%s: adding proxy [delta=%ld] for handle=%d\n", + __FUNCTION__, (long)bo->delta, bo->handle)); + delta += bo->delta; + assert(bo->handle == bo->proxy->handle); + /* need to release the cache upon batch submit */ + if (bo->exec == NULL) { + list_move_tail(&bo->request, + &kgem->next_request->buffers); + bo->rq = MAKE_REQUEST(kgem->next_request, + kgem->ring); + bo->exec = &_kgem_dummy_exec; + bo->domain = DOMAIN_GPU; + } + + if (read_write_domain & 0x7fff && !bo->gpu_dirty) + __kgem_bo_mark_dirty(bo); + + bo = bo->proxy; + assert(bo->refcnt); + } + assert(bo->refcnt); + + if (bo->exec == NULL) + kgem_add_bo(kgem, bo); + assert(bo->rq == MAKE_REQUEST(kgem->next_request, kgem->ring)); + assert(RQ_RING(bo->rq) == kgem->ring); + + kgem->reloc[index].delta = delta; + kgem->reloc[index].target_handle = bo->target_handle; + kgem->reloc[index].presumed_offset = bo->presumed_offset; + + if (read_write_domain & 0x7fff && !bo->gpu_dirty) { + assert(!bo->snoop || kgem->can_blt_cpu); + __kgem_bo_mark_dirty(bo); + } + + delta += bo->presumed_offset; + } else { + kgem->reloc[index].delta = delta; + kgem->reloc[index].target_handle = ~0U; + kgem->reloc[index].presumed_offset = 0; + if (kgem->nreloc__self < 256) + kgem->reloc__self[kgem->nreloc__self++] = index; + } + kgem->reloc[index].read_domains = read_write_domain >> 16; + kgem->reloc[index].write_domain = read_write_domain & 0x7fff; + + return delta; +} + static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) { int i, j; @@ -4186,6 +4434,7 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) i = 0; while (kgem->vma[type].count > 0) { struct kgem_bo *bo = NULL; + void **ptr; for (j = 0; bo == NULL && j < ARRAY_SIZE(kgem->vma[type].inactive); @@ -4198,15 +4447,14 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) break; DBG(("%s: discarding inactive %s vma cache for %d\n", - __FUNCTION__, - IS_CPU_MAP(bo->map) ? "CPU" : "GTT", bo->handle)); - assert(IS_CPU_MAP(bo->map) == type); - assert(bo->map); + __FUNCTION__, type ? "CPU" : "GTT", bo->handle)); + + ptr = type ? 
&bo->map__cpu : &bo->map__gtt; assert(bo->rq == NULL); - VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); -// munmap(MAP(bo->map), bytes(bo)); - bo->map = NULL; + VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(*ptr), bytes(bo))); +// munmap(MAP(*ptr), bytes(bo)); + *ptr = NULL; list_del(&bo->vma); kgem->vma[type].count--; @@ -4222,12 +4470,11 @@ void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo) { void *ptr; - DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, - bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, + bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); assert(bo->proxy == NULL); assert(list_is_empty(&bo->list)); - assert(!IS_USER_MAP(bo->map)); assert_tiling(kgem, bo); if (bo->tiling == I915_TILING_NONE && !bo->scanout && kgem->has_llc) { @@ -4236,12 +4483,9 @@ void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo) return kgem_bo_map__cpu(kgem, bo); } - if (IS_CPU_MAP(bo->map)) - kgem_bo_release_map(kgem, bo); - - ptr = bo->map; + ptr = MAP(bo->map__gtt); if (ptr == NULL) { - assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); + assert(num_pages(bo) <= kgem->aperture_mappable / 2); kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); @@ -4254,7 +4498,7 @@ void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo) * issue with compositing managers which need to frequently * flush CPU damage to their GPU bo. */ - bo->map = ptr; + bo->map__gtt = ptr; DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); } @@ -4265,12 +4509,11 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo) { void *ptr; - DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, - bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, + bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); assert(bo->proxy == NULL); assert(list_is_empty(&bo->list)); - assert(!IS_USER_MAP(bo->map)); assert(bo->exec == NULL); assert_tiling(kgem, bo); @@ -4284,12 +4527,9 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo) return ptr; } - if (IS_CPU_MAP(bo->map)) - kgem_bo_release_map(kgem, bo); - - ptr = bo->map; + ptr = MAP(bo->map__gtt); if (ptr == NULL) { - assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); + assert(num_pages(bo) <= kgem->aperture_mappable / 2); assert(kgem->gen != 021 || bo->tiling != I915_TILING_Y); kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); @@ -4303,7 +4543,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo) * issue with compositing managers which need to frequently * flush CPU damage to their GPU bo. 
*/ - bo->map = ptr; + bo->map__gtt = ptr; DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); } @@ -4333,20 +4573,16 @@ void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) { void *ptr; - DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, - bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + DBG(("%s: handle=%d, offset=%ld, tiling=%d, map=%p:%p, domain=%d\n", __FUNCTION__, + bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain)); assert(bo->exec == NULL); assert(list_is_empty(&bo->list)); - assert(!IS_USER_MAP(bo->map)); assert_tiling(kgem, bo); - if (IS_CPU_MAP(bo->map)) - kgem_bo_release_map(kgem, bo); - - ptr = bo->map; + ptr = MAP(bo->map__gtt); if (ptr == NULL) { - assert(bytes(bo) <= kgem->aperture_mappable / 4); + assert(num_pages(bo) <= kgem->aperture_mappable / 4); kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); @@ -4359,7 +4595,7 @@ void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) * issue with compositing managers which need to frequently * flush CPU damage to their GPU bo. */ - bo->map = ptr; + bo->map__gtt = ptr; DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); } @@ -4368,28 +4604,21 @@ void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo) { - if (bo->map) - return MAP(bo->map); - - kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); - return bo->map = __kgem_bo_map__gtt(kgem, bo); + return kgem_bo_map__async(kgem, bo); } void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) { struct drm_i915_gem_mmap mmap_arg; - DBG(("%s(handle=%d, size=%d, mapped? %d)\n", - __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); + DBG(("%s(handle=%d, size=%d, map=%p:%p)\n", + __FUNCTION__, bo->handle, bytes(bo), bo->map__gtt, bo->map__cpu)); assert(!bo->purged); assert(list_is_empty(&bo->list)); assert(bo->proxy == NULL); - if (IS_CPU_MAP(bo->map)) - return MAP(bo->map); - - if (bo->map) - kgem_bo_release_map(kgem, bo); + if (bo->map__cpu) + return MAP(bo->map__cpu); kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo)); @@ -4399,58 +4628,14 @@ retry: mmap_arg.offset = 0; mmap_arg.size = bytes(bo); if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { + int err = 0; + if (__kgem_throttle_retire(kgem, 0)) goto retry; - if (kgem->need_expire) { - kgem_cleanup_cache(kgem); + if (kgem_cleanup_cache(kgem)) goto retry; - } - - ErrorF("%s: failed to mmap handle=%d, %d bytes, into CPU domain\n", - __FUNCTION__, bo->handle, bytes(bo)); - return NULL; - } - - VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); - - DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); - bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); - return (void *)(uintptr_t)mmap_arg.addr_ptr; -} - -void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) -{ - struct drm_i915_gem_mmap mmap_arg; - - DBG(("%s(handle=%d, size=%d, mapped? 
%d)\n", - __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); - assert(bo->refcnt); - assert(!bo->purged); - assert(list_is_empty(&bo->list)); - assert(bo->proxy == NULL); - - if (IS_CPU_MAP(bo->map)) - return MAP(bo->map); - -retry: - VG_CLEAR(mmap_arg); - mmap_arg.handle = bo->handle; - mmap_arg.offset = 0; - mmap_arg.size = bytes(bo); - if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { - int err = errno; - - assert(err != EINVAL); - - if (__kgem_throttle_retire(kgem, 0)) - goto retry; - - if (kgem->need_expire) { - kgem_cleanup_cache(kgem); - goto retry; - } ErrorF("%s: failed to mmap handle=%d, %d bytes, into CPU domain: %d\n", __FUNCTION__, bo->handle, bytes(bo), err); @@ -4458,16 +4643,68 @@ retry: } VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); - if (bo->map && bo->domain == DOMAIN_CPU) { - DBG(("%s: discarding GTT vma for %d\n", __FUNCTION__, bo->handle)); - kgem_bo_release_map(kgem, bo); - } - if (bo->map == NULL) { - DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); - bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); - } - return (void *)(uintptr_t)mmap_arg.addr_ptr; + + DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); + return bo->map__cpu = (void *)(uintptr_t)mmap_arg.addr_ptr; } + + +/* +struct kgem_bo *kgem_create_map(struct kgem *kgem, + void *ptr, uint32_t size, + bool read_only) +{ + struct kgem_bo *bo; + uintptr_t first_page, last_page; + uint32_t handle; + + assert(MAP(ptr) == ptr); + + if (!kgem->has_userptr) + return NULL; + + first_page = (uintptr_t)ptr; + last_page = first_page + size + PAGE_SIZE - 1; + + first_page &= ~(PAGE_SIZE-1); + last_page &= ~(PAGE_SIZE-1); + assert(last_page > first_page); + + handle = gem_userptr(kgem->fd, + (void *)first_page, last_page-first_page, + read_only); + if (handle == 0) + return NULL; + + bo = __kgem_bo_alloc(handle, (last_page - first_page) / PAGE_SIZE); + if (bo == NULL) { + gem_close(kgem->fd, handle); + return NULL; + } + + bo->snoop = !kgem->has_llc; + debug_alloc__bo(kgem, bo); + + if (first_page != (uintptr_t)ptr) { + struct kgem_bo *proxy; + + proxy = kgem_create_proxy(kgem, bo, + (uintptr_t)ptr - first_page, size); + kgem_bo_destroy(kgem, bo); + if (proxy == NULL) + return NULL; + + bo = proxy; + } + + bo->map__cpu = MAKE_USER_MAP(ptr); + + DBG(("%s(ptr=%p, size=%d, pages=%d, read_only=%d) => handle=%d (proxy? %d)\n", + __FUNCTION__, ptr, size, NUM_PAGES(size), read_only, handle, bo->proxy != NULL)); + return bo; +} +*/ + void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo) { DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); @@ -4500,6 +4737,72 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo) } } +void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write) +{ + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + assert(!bo->scanout || !write); + + if (write || bo->needs_flush) + kgem_bo_submit(kgem, bo); + + /* SHM pixmaps use proxies for subpage offsets */ + assert(!bo->purged); + assert(bo->refcnt); + while (bo->proxy) + bo = bo->proxy; + assert(bo->refcnt); + assert(!bo->purged); + + if (bo->domain != DOMAIN_CPU || FORCE_MMAP_SYNC & (1 << DOMAIN_CPU)) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: SYNC: handle=%d, needs_flush? %d, domain? %d, busy? %d\n", + __FUNCTION__, bo->handle, + bo->needs_flush, bo->domain, + __kgem_busy(kgem, bo->handle))); + + VG_CLEAR(set_domain); + set_domain.handle = bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_CPU; + set_domain.write_domain = write ? 
I915_GEM_DOMAIN_CPU : 0; + + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) { + if (bo->exec == NULL) + kgem_bo_retire(kgem, bo); + bo->domain = write ? DOMAIN_CPU : DOMAIN_NONE; + } + } +} + +void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + assert(bo->refcnt); + assert(bo->proxy == NULL); + + kgem_bo_submit(kgem, bo); + + if (bo->domain != DOMAIN_GTT || FORCE_MMAP_SYNC & (1 << DOMAIN_GTT)) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: SYNC: handle=%d, needs_flush? %d, domain? %d, busy? %d\n", + __FUNCTION__, bo->handle, + bo->needs_flush, bo->domain, + __kgem_busy(kgem, bo->handle))); + + VG_CLEAR(set_domain); + set_domain.handle = bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) { + kgem_bo_retire(kgem, bo); + bo->domain = DOMAIN_GTT; + bo->gtt_dirty = true; + } + } +} + void kgem_clear_dirty(struct kgem *kgem) { struct list * const buffers = &kgem->next_request->buffers; @@ -4542,7 +4845,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem *kgem, bo->proxy = kgem_bo_reference(target); bo->delta = offset; - if (target->exec) { + if (target->exec && !bo->io) { list_move_tail(&bo->request, &kgem->next_request->buffers); bo->exec = &_kgem_dummy_exec; } @@ -4563,7 +4866,7 @@ buffer_alloc(void) bo->mem = NULL; bo->need_io = false; - bo->mmapped = true; + bo->mmapped = MMAPPED_CPU; return bo; } @@ -4638,7 +4941,7 @@ search_snoopable_buffer(struct kgem *kgem, unsigned alloc) assert(bo->base.snoop); assert(bo->base.tiling == I915_TILING_NONE); assert(num_pages(&bo->base) >= alloc); - assert(bo->mmapped == true); + assert(bo->mmapped == MMAPPED_CPU); assert(bo->need_io == false); bo->mem = kgem_bo_map__cpu(kgem, &bo->base); @@ -4685,7 +4988,7 @@ create_snoopable_buffer(struct kgem *kgem, unsigned alloc) } assert(bo->base.refcnt == 1); - assert(bo->mmapped == true); + assert(bo->mmapped == MMAPPED_CPU); assert(bo->need_io == false); bo->mem = kgem_bo_map__cpu(kgem, &bo->base); @@ -4721,7 +5024,7 @@ create_snoopable_buffer(struct kgem *kgem, unsigned alloc) } assert(bo->base.refcnt == 1); - assert(bo->mmapped == true); + assert(bo->mmapped == MMAPPED_CPU); assert(bo->need_io == false); if (!gem_set_caching(kgem->fd, bo->base.handle, SNOOPED)) @@ -4763,12 +5066,12 @@ free_caching: DBG(("%s: created snoop handle=%d for buffer\n", __FUNCTION__, bo->base.handle)); - assert(bo->mmapped == true); + assert(bo->mmapped == MMAPPED_CPU); assert(bo->need_io == false); bo->base.refcnt = 1; bo->base.snoop = true; - bo->base.map = MAKE_USER_MAP(bo->mem); + bo->base.map__cpu = MAKE_USER_MAP(bo->mem); return bo; } @@ -4801,11 +5104,12 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, /* We can reuse any write buffer which we can fit */ if (flags == KGEM_BUFFER_LAST && bo->write == KGEM_BUFFER_WRITE && - bo->base.refcnt == 1 && !bo->mmapped && + bo->base.refcnt == 1 && + bo->mmapped == MMAPPED_NONE && size <= bytes(&bo->base)) { DBG(("%s: reusing write buffer for read of %d bytes? 
used=%d, total=%d\n", __FUNCTION__, size, bo->used, bytes(&bo->base))); - gem_write(kgem->fd, bo->base.handle, + gem_write__cachealigned(kgem->fd, bo->base.handle, 0, bo->used, bo->mem); kgem_buffer_release(kgem, bo); bo->need_io = 0; @@ -4845,10 +5149,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, list_for_each_entry(bo, &kgem->active_buffers, base.list) { assert(bo->base.io); assert(bo->base.refcnt >= 1); + assert(bo->base.exec == NULL); assert(bo->mmapped); - assert(!IS_CPU_MAP(bo->base.map) || kgem->has_llc || bo->base.snoop); + assert(bo->mmapped == MMAPPED_GTT || kgem->has_llc || bo->base.snoop); - if (!kgem->has_llc && (bo->write & ~flags) & KGEM_BUFFER_INPLACE) { + if ((bo->write & ~flags) & KGEM_BUFFER_INPLACE && !bo->base.snoop) { DBG(("%s: skip write %x buffer, need %x\n", __FUNCTION__, bo->write, flags)); continue; @@ -4862,6 +5167,29 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, list_move(&bo->base.list, &kgem->batch_buffers); goto done; } + + if (size <= bytes(&bo->base) && + (bo->base.rq == NULL || + !__kgem_busy(kgem, bo->base.handle))) { + DBG(("%s: reusing whole buffer? size=%d, total=%d\n", + __FUNCTION__, size, bytes(&bo->base))); + __kgem_bo_clear_busy(&bo->base); + kgem_buffer_release(kgem, bo); + + switch (bo->mmapped) { + case MMAPPED_CPU: + kgem_bo_sync__cpu(kgem, &bo->base); + break; + case MMAPPED_GTT: + kgem_bo_sync__gtt(kgem, &bo->base); + break; + } + + offset = 0; + bo->used = size; + list_move(&bo->base.list, &kgem->batch_buffers); + goto done; + } } } #endif @@ -4875,9 +5203,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, alloc = PAGE_ALIGN(size); assert(alloc); + alloc /= PAGE_SIZE; if (alloc > kgem->aperture_mappable / 4) flags &= ~KGEM_BUFFER_INPLACE; - alloc /= PAGE_SIZE; if (kgem->has_llc && (flags & KGEM_BUFFER_WRITE_INPLACE) != KGEM_BUFFER_WRITE_INPLACE) { @@ -4963,7 +5291,7 @@ skip_llc: CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP); if (old == NULL) { old = search_linear_cache(kgem, alloc, CREATE_INACTIVE); - if (old && !__kgem_bo_is_mappable(kgem, old)) { + if (old && !kgem_bo_can_map(kgem, old)) { _kgem_bo_destroy(kgem, old); old = NULL; } @@ -4971,7 +5299,7 @@ skip_llc: if (old) { DBG(("%s: reusing handle=%d for buffer\n", __FUNCTION__, old->handle)); - assert(__kgem_bo_is_mappable(kgem, old)); + assert(kgem_bo_can_map(kgem, old)); assert(!old->snoop); assert(old->rq == NULL); @@ -4987,8 +5315,10 @@ skip_llc: bo->mem = kgem_bo_map(kgem, &bo->base); if (bo->mem) { - if (IS_CPU_MAP(bo->base.map)) + if (bo->mem == MAP(bo->base.map__cpu)) flags &= ~KGEM_BUFFER_INPLACE; + else + bo->mmapped = MMAPPED_GTT; goto init; } else { bo->base.refcnt = 0; @@ -5107,7 +5437,8 @@ init: assert(!bo->need_io || !bo->base.needs_flush); assert(!bo->need_io || bo->base.domain != DOMAIN_GPU); assert(bo->mem); - assert(!bo->mmapped || bo->base.map != NULL); + assert(bo->mmapped != MMAPPED_GTT || MAP(bo->base.map__gtt) == bo->mem); + assert(bo->mmapped != MMAPPED_CPU || MAP(bo->base.map__cpu) == bo->mem); bo->used = size; bo->write = flags & KGEM_BUFFER_WRITE_INPLACE; @@ -5121,6 +5452,7 @@ init: done: bo->used = ALIGN(bo->used, UPLOAD_ALIGNMENT); + assert(bo->used && bo->used <= bytes(&bo->base)); assert(bo->mem); *ret = (char *)bo->mem + offset; return kgem_create_proxy(kgem, &bo->base, offset, size); @@ -5177,7 +5509,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, bo->size.bytes -= stride; } - bo->map = MAKE_CPU_MAP(*ret); + bo->map__cpu = *ret; bo->pitch = stride; bo->unique_id = kgem_get_unique_id(kgem); return bo; @@ 
-5222,10 +5554,10 @@ void kgem_proxy_bo_attach(struct kgem_bo *bo, struct kgem_bo **ptr) { DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); - assert(bo->map == NULL || IS_CPU_MAP(bo->map)); + assert(bo->map__gtt == NULL); assert(bo->proxy); list_add(&bo->vma, &bo->proxy->vma); - bo->map = ptr; + bo->map__gtt = ptr; *ptr = kgem_bo_reference(bo); } @@ -5258,13 +5590,13 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo) bo->base.domain, __kgem_busy(kgem, bo->base.handle))); - assert(!IS_CPU_MAP(bo->base.map) || bo->base.snoop || kgem->has_llc); + assert(bo->mmapped == MMAPPED_GTT || bo->base.snoop || kgem->has_llc); VG_CLEAR(set_domain); set_domain.handle = bo->base.handle; set_domain.write_domain = 0; set_domain.read_domains = - IS_CPU_MAP(bo->base.map) ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT; + bo->mmapped == MMAPPED_CPU ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT; if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) diff --git a/contrib/sdk/sources/Intel-2D/sna/kgem.h b/contrib/sdk/sources/Intel-2D/sna/kgem.h index 64e2a4d882..a00672cbb6 100644 --- a/contrib/sdk/sources/Intel-2D/sna/kgem.h +++ b/contrib/sdk/sources/Intel-2D/sna/kgem.h @@ -71,9 +71,8 @@ struct kgem_bo { struct list request; struct list vma; - void *map; -#define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1) -#define IS_GTT_MAP(ptr) (ptr && ((uintptr_t)(ptr) & 1) == 0) + void *map__cpu; + void *map__gtt; #define MAP(ptr) ((void*)((uintptr_t)(ptr) & ~3)) struct kgem_bo_binding { @@ -82,11 +81,11 @@ struct kgem_bo { uint16_t offset; } binding; + uint64_t presumed_offset; uint32_t unique_id; uint32_t refcnt; uint32_t handle; uint32_t target_handle; - uint32_t presumed_offset; uint32_t delta; union { struct { @@ -200,11 +199,12 @@ struct kgem { uint32_t has_handle_lut :1; uint32_t can_blt_cpu :1; + uint32_t can_render_y :1; uint16_t fence_max; uint16_t half_cpu_cache_pages; uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable; - uint32_t aperture, aperture_fenced; + uint32_t aperture, aperture_fenced, aperture_max_fence; uint32_t max_upload_tile_size, max_copy_tile_size; uint32_t max_gpu_size, max_cpu_size; uint32_t large_object_size, max_object_size; @@ -313,6 +313,8 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem, int bpp, uint32_t flags); +bool kgem_bo_convert_to_gpu(struct kgem *kgem, struct kgem_bo *bo); + uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format); void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset); @@ -349,14 +351,6 @@ static inline void kgem_submit(struct kgem *kgem) _kgem_submit(kgem); } -static inline bool kgem_flush(struct kgem *kgem, bool flush) -{ - if (kgem->nreloc == 0) - return false; - - return (kgem->flush ^ flush) && kgem_ring_is_idle(kgem, kgem->ring); -} - static inline void kgem_bo_submit(struct kgem *kgem, struct kgem_bo *bo) { if (bo->exec) @@ -392,8 +386,10 @@ static inline void kgem_set_mode(struct kgem *kgem, kgem_submit(kgem); #endif - if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring)) + if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring)) { + DBG(("%s: flushing before new bo\n", __FUNCTION__)); _kgem_submit(kgem); + } if (kgem->mode == mode) return; @@ -466,6 +462,11 @@ uint32_t kgem_add_reloc(struct kgem *kgem, struct kgem_bo *bo, uint32_t read_write_domains, uint32_t delta); +uint64_t kgem_add_reloc64(struct kgem *kgem, + uint32_t pos, + struct kgem_bo *bo, + uint32_t read_write_domains, + uint64_t delta); void *kgem_bo_map(struct kgem 
*kgem, struct kgem_bo *bo); void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo); @@ -475,15 +476,13 @@ void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo); void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo); void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write); -void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); -void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr); uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo); bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, const void *data, int length); int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo); -void kgem_get_tile_size(struct kgem *kgem, int tiling, +void kgem_get_tile_size(struct kgem *kgem, int tiling, int pitch, int *tile_width, int *tile_height, int *tile_size); static inline int __kgem_buffer_size(struct kgem_bo *bo) @@ -498,6 +497,12 @@ static inline int __kgem_bo_size(struct kgem_bo *bo) return PAGE_SIZE * bo->size.pages.count; } +static inline int __kgem_bo_num_pages(struct kgem_bo *bo) +{ + assert(bo->proxy == NULL); + return bo->size.pages.count; +} + static inline int kgem_bo_size(struct kgem_bo *bo) { if (bo->proxy) @@ -506,7 +511,6 @@ static inline int kgem_bo_size(struct kgem_bo *bo) return __kgem_bo_size(bo); } -/* static inline bool kgem_bo_blt_pitch_is_ok(struct kgem *kgem, struct kgem_bo *bo) { @@ -533,80 +537,6 @@ static inline bool kgem_bo_can_blt(struct kgem *kgem, return kgem_bo_blt_pitch_is_ok(kgem, bo); } -*/ - -static inline bool __kgem_bo_is_mappable(struct kgem *kgem, - struct kgem_bo *bo) -{ - if (bo->domain == DOMAIN_GTT) - return true; - - if (kgem->gen < 040 && bo->tiling && - bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1)) - return false; - - if (kgem->gen == 021 && bo->tiling == I915_TILING_Y) - return false; - - if (kgem->has_llc && bo->tiling == I915_TILING_NONE) - return true; - - if (!bo->presumed_offset) - return kgem_bo_size(bo) <= kgem->aperture_mappable / 4; - - return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable; -} - -static inline bool kgem_bo_is_mappable(struct kgem *kgem, - struct kgem_bo *bo) -{ - DBG(("%s: domain=%d, offset: %d size: %d\n", - __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo))); - assert(bo->refcnt); - return __kgem_bo_is_mappable(kgem, bo); -} - -static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo) -{ - DBG(("%s: map=%p, tiling=%d, domain=%d\n", - __FUNCTION__, bo->map, bo->tiling, bo->domain)); - assert(bo->refcnt); - - if (bo->map == NULL) - return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU; - - return IS_CPU_MAP(bo->map) == !bo->tiling; -} - -static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo) -{ - if (kgem_bo_mapped(kgem, bo)) - return true; - - if (!bo->tiling && (kgem->has_llc || bo->domain == DOMAIN_CPU)) - return true; - - if (kgem->gen == 021 && bo->tiling == I915_TILING_Y) - return false; - - return kgem_bo_size(bo) <= kgem->aperture_mappable / 4; -} - -static inline bool kgem_bo_can_map__cpu(struct kgem *kgem, - struct kgem_bo *bo, - bool write) -{ - if (bo->purged || (bo->scanout && write)) - return false; - - if (kgem->has_llc) - return true; - - if (bo->domain != DOMAIN_CPU) - return false; - - return !write || bo->exec == NULL; -} static inline bool kgem_bo_is_snoop(struct kgem_bo *bo) { @@ -652,9 +582,6 @@ static inline bool __kgem_bo_is_busy(struct kgem *kgem, 
struct kgem_bo *bo) if (bo->exec) return true; - if (kgem_flush(kgem, bo->flush)) - kgem_submit(kgem); - if (bo->rq && !__kgem_busy(kgem, bo->handle)) __kgem_bo_clear_busy(bo); @@ -723,6 +650,53 @@ static inline void kgem_bo_mark_dirty(struct kgem_bo *bo) } while ((bo = bo->proxy)); } +static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: map=%p:%p, tiling=%d, domain=%d\n", + __FUNCTION__, bo->map__gtt, bo->map__cpu, bo->tiling, bo->domain)); + + if (bo->tiling == I915_TILING_NONE && (bo->domain == DOMAIN_CPU || kgem->has_llc)) + return bo->map__cpu != NULL; + + return bo->map__gtt != NULL; +} + +static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: map=%p:%p, tiling=%d, domain=%d, offset=%ld\n", + __FUNCTION__, bo->map__gtt, bo->map__cpu, bo->tiling, bo->domain, (long)bo->presumed_offset)); + + if (!bo->tiling && (kgem->has_llc || bo->domain == DOMAIN_CPU)) + return true; + + if (bo->map__gtt != NULL) + return true; + + if (kgem->gen == 021 && bo->tiling == I915_TILING_Y) + return false; + + if (!bo->presumed_offset) + return __kgem_bo_num_pages(bo) <= kgem->aperture_mappable / 4; + + return bo->presumed_offset / PAGE_SIZE + __kgem_bo_num_pages(bo) <= kgem->aperture_mappable; +} + +static inline bool kgem_bo_can_map__cpu(struct kgem *kgem, + struct kgem_bo *bo, + bool write) +{ + if (bo->purged || (bo->scanout && write)) + return false; + + if (kgem->has_llc) + return true; + + if (bo->domain != DOMAIN_CPU) + return false; + + return !write || bo->exec == NULL; +} + #define KGEM_BUFFER_WRITE 0x1 #define KGEM_BUFFER_INPLACE 0x2 #define KGEM_BUFFER_LAST 0x4 @@ -742,8 +716,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *bo); void kgem_throttle(struct kgem *kgem); #define MAX_INACTIVE_TIME 10 bool kgem_expire_cache(struct kgem *kgem); -void kgem_purge_cache(struct kgem *kgem); -void kgem_cleanup_cache(struct kgem *kgem); +bool kgem_cleanup_cache(struct kgem *kgem); void kgem_clean_scanout_cache(struct kgem *kgem); void kgem_clean_large_cache(struct kgem *kgem); @@ -758,4 +731,6 @@ static inline void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch) } #endif +void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling); + #endif /* KGEM_H */ diff --git a/contrib/sdk/sources/Intel-2D/sna/sna.c b/contrib/sdk/sources/Intel-2D/sna/sna.c index d6facc9dec..b4f17c86f8 100644 --- a/contrib/sdk/sources/Intel-2D/sna/sna.c +++ b/contrib/sdk/sources/Intel-2D/sna/sna.c @@ -706,6 +706,19 @@ sna_wait_for_scanline(struct sna *sna, +int intel_get_device_id(struct sna *sna) +{ + struct drm_i915_getparam gp; + int devid = 0; + + memset(&gp, 0, sizeof(gp)); + gp.param = I915_PARAM_CHIPSET_ID; + gp.value = &devid; + + if (drmIoctl(sna->scrn, DRM_IOCTL_I915_GETPARAM, &gp)) + return 0; + return devid; +} static const struct intel_device_info intel_generic_info = { .gen = -1, @@ -814,21 +827,6 @@ intel_detect_chipset(struct pci_device *pci) return &intel_generic_info; } -int intel_get_device_id(int fd) -{ - struct drm_i915_getparam gp; - int devid = 0; - - memset(&gp, 0, sizeof(gp)); - gp.param = I915_PARAM_CHIPSET_ID; - gp.value = &devid; - - if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp)) - return 0; - - return devid; -} - int drmIoctl(int fd, unsigned long request, void *arg) { ioctl_t io; diff --git a/contrib/sdk/sources/Intel-2D/sna/sna.h b/contrib/sdk/sources/Intel-2D/sna/sna.h index f11790b2f4..b79160fb4f 100644 --- a/contrib/sdk/sources/Intel-2D/sna/sna.h +++ b/contrib/sdk/sources/Intel-2D/sna/sna.h @@ -448,7 +448,8 
@@ struct sna { unsigned flags; #define SNA_NO_WAIT 0x1 #define SNA_NO_FLIP 0x2 -#define SNA_TRIPLE_BUFFER 0x4 +#define SNA_NO_VSYNC 0x4 +#define SNA_TRIPLE_BUFFER 0x8 #define SNA_TEAR_FREE 0x10 #define SNA_FORCE_SHADOW 0x20 #define SNA_FLUSH_GTT 0x40 @@ -490,6 +491,7 @@ struct sna { uint32_t fill_alu; } blt_state; union { + unsigned gt; struct gen3_render_state gen3; struct gen4_render_state gen4; struct gen5_render_state gen5; @@ -497,6 +499,8 @@ struct sna { struct gen7_render_state gen7; } render_state; + bool dri_available; + bool dri_open; /* Broken-out options. */ // OptionInfoPtr Options; @@ -611,4 +615,7 @@ sna_transform_equal(const PictTransform *a, const PictTransform *b) return memcmp(a, b, sizeof(*a)) == 0; } + +int intel_get_device_id(struct sna *sna); + #endif /* _SNA_H */ diff --git a/contrib/sdk/sources/Intel-2D/sna/sna_reg.h b/contrib/sdk/sources/Intel-2D/sna/sna_reg.h index 26282361c0..bda6ef67e9 100644 --- a/contrib/sdk/sources/Intel-2D/sna/sna_reg.h +++ b/contrib/sdk/sources/Intel-2D/sna/sna_reg.h @@ -42,22 +42,22 @@ #define BLT_SRC_TILED (1<<15) #define BLT_DST_TILED (1<<11) -#define COLOR_BLT_CMD ((2<<29)|(0x40<<22)|(0x3)) -#define XY_COLOR_BLT ((2<<29)|(0x50<<22)|(0x4)) -#define XY_SETUP_BLT ((2<<29)|(1<<22)|6) -#define XY_SETUP_MONO_PATTERN_SL_BLT ((2<<29)|(0x11<<22)|7) -#define XY_SETUP_CLIP ((2<<29)|(3<<22)|1) -#define XY_SCANLINE_BLT ((2<<29)|(0x25<<22)|1) -#define XY_TEXT_IMMEDIATE_BLT ((2<<29)|(0x31<<22)|(1<<16)) -#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|0x4) -#define XY_PAT_BLT ((2<<29)|(0x51<<22)|0x4) -#define XY_PAT_BLT_IMMEDIATE ((2<<29)|(0x72<<22)) -#define XY_MONO_PAT ((0x2<<29)|(0x52<<22)|0x7) -#define XY_MONO_SRC_COPY ((0x2<<29)|(0x54<<22)|(0x6)) -#define XY_MONO_SRC_COPY_IMM ((0x2<<29)|(0x71<<22)) -#define XY_FULL_MONO_PATTERN_BLT ((0x2<<29)|(0x57<<22)|0xa) -#define XY_FULL_MONO_PATTERN_MONO_SRC_BLT ((0x2<<29)|(0x58<<22)|0xa) +#define COLOR_BLT_CMD (2<<29|0x40<<22|(0x3)) +#define XY_COLOR_BLT (2<<29|0x50<<22|(0x4)) +#define XY_SETUP_BLT (2<<29|0x01<<22) +#define XY_SETUP_MONO_PATTERN_SL_BLT (2<<29|0x11<<22) +#define XY_SETUP_CLIP (2<<29|0x03<<22|1) +#define XY_SCANLINE_BLT (2<<29|0x25<<22|1) +#define XY_TEXT_IMMEDIATE_BLT (2<<29|0x31<<22|(1<<16)) +#define XY_SRC_COPY_BLT_CMD (2<<29|0x53<<22) +#define SRC_COPY_BLT_CMD (2<<29|0x43<<22|0x4) +#define XY_PAT_BLT (2<<29|0x51<<22) +#define XY_PAT_BLT_IMMEDIATE (2<<29|0x72<<22) +#define XY_MONO_PAT (2<<29|0x52<<22) +#define XY_MONO_SRC_COPY (2<<29|0x54<<22) +#define XY_MONO_SRC_COPY_IMM (2<<29|0x71<<22) +#define XY_FULL_MONO_PATTERN_BLT (2<<29|0x57<<22) +#define XY_FULL_MONO_PATTERN_MONO_SRC_BLT (2<<29|0x58<<22) /* FLUSH commands */ #define BRW_3D(Pipeline,Opcode,Subopcode) \ diff --git a/contrib/sdk/sources/Intel-2D/sna/sna_render.h b/contrib/sdk/sources/Intel-2D/sna/sna_render.h index 4ab8acf544..ad0ec41d38 100644 --- a/contrib/sdk/sources/Intel-2D/sna/sna_render.h +++ b/contrib/sdk/sources/Intel-2D/sna/sna_render.h @@ -104,6 +104,7 @@ struct sna_composite_op { uint32_t inplace :1; uint32_t overwrites:1; uint32_t bpp : 6; + uint32_t alu : 4; uint32_t cmd; uint32_t br13; @@ -245,7 +246,7 @@ struct sna_render { struct sna_solid_cache { struct kgem_bo *cache_bo; struct kgem_bo *bo[1024]; - uint32_t color[1025]; + uint32_t color[1024]; int last; int size; int dirty; @@ -381,6 +382,7 @@ enum { }; struct gen6_render_state { + unsigned gt; const struct gt_info *info; struct kgem_bo *general_bo; @@ -430,6 +432,7 @@ enum { }; struct gen7_render_state { + 
unsigned gt; const struct gt_info *info; struct kgem_bo *general_bo; diff --git a/contrib/sdk/sources/Intel-2D/uxa/uxa.c b/contrib/sdk/sources/Intel-2D/uxa/uxa.c index ba5de045ac..14defe797e 100644 --- a/contrib/sdk/sources/Intel-2D/uxa/uxa.c +++ b/contrib/sdk/sources/Intel-2D/uxa/uxa.c @@ -70,10 +70,10 @@ static void i830_done_composite(PixmapPtr dest) // intel_debug_flush(scrn); } -int sna_bitmap_from_handle(bitmap_t *bitmap, uint32_t handle) +int uxa_bitmap_from_handle(bitmap_t *bitmap, uint32_t handle) { - struct intel_screen_private *intel = intel_get_screen_private(); - drm_intel_bo *bo; + struct intel_screen_private *intel = intel_get_screen_private(); + drm_intel_bo *bo; surface_t *sf; unsigned int size; @@ -118,14 +118,14 @@ err_1: return -1; }; -void sna_set_bo_handle(bitmap_t *bitmap, int handle) +void uxa_set_bo_handle(bitmap_t *bitmap, int handle) { - sna_bitmap_from_handle(bitmap, handle); + uxa_bitmap_from_handle(bitmap, handle); }; -int sna_blit_tex(bitmap_t *bitmap, bool scale, int dst_x, int dst_y, - int w, int h, int src_x, int src_y) +int uxa_blit_tex(bitmap_t *bitmap, int scale, int vsync, + int dst_x, int dst_y,int w, int h, int src_x, int src_y) { // DBG("%s\n", __FUNCTION__);