diff --git a/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.c b/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.c new file mode 100644 index 0000000000..e276fd1390 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.c @@ -0,0 +1,322 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Zack Rusin + */ + +#include "util/u_debug.h" + +#include "util/u_memory.h" + +#include "cso_cache.h" +#include "cso_hash.h" + + +struct cso_cache { + struct cso_hash *hashes[CSO_CACHE_MAX]; + int max_size; + + cso_sanitize_callback sanitize_cb; + void *sanitize_data; +}; + +#if 1 +static unsigned hash_key(const void *key, unsigned key_size) +{ + unsigned *ikey = (unsigned *)key; + unsigned hash = 0, i; + + assert(key_size % 4 == 0); + + /* I'm sure this can be improved on: + */ + for (i = 0; i < key_size/4; i++) + hash ^= ikey[i]; + + return hash; +} +#else +static unsigned hash_key(const unsigned char *p, int n) +{ + unsigned h = 0; + unsigned g; + + while (n--) { + h = (h << 4) + *p++; + if ((g = (h & 0xf0000000)) != 0) + h ^= g >> 23; + h &= ~g; + } + return h; +} +#endif + +unsigned cso_construct_key(void *item, int item_size) +{ + return hash_key((item), item_size); +} + +static INLINE struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type) +{ + struct cso_hash *hash; + hash = sc->hashes[type]; + return hash; +} + +static void delete_blend_state(void *state, void *data) +{ + struct cso_blend *cso = (struct cso_blend *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_depth_stencil_state(void *state, void *data) +{ + struct cso_depth_stencil_alpha *cso = (struct cso_depth_stencil_alpha *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_sampler_state(void *state, void *data) +{ + struct cso_sampler *cso = (struct cso_sampler *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_rasterizer_state(void *state, void *data) +{ + struct cso_rasterizer *cso = (struct cso_rasterizer *)state; + if (cso->delete_state) + 
cso->delete_state(cso->context, cso->data);
+   FREE(state);
+}
+
+static void delete_velements(void *state, void *data)
+{
+   struct cso_velements *cso = (struct cso_velements *)state;
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+}
+
+static INLINE void delete_cso(void *state, enum cso_cache_type type)
+{
+   switch (type) {
+   case CSO_BLEND:
+      delete_blend_state(state, 0);
+      break;
+   case CSO_SAMPLER:
+      delete_sampler_state(state, 0);
+      break;
+   case CSO_DEPTH_STENCIL_ALPHA:
+      delete_depth_stencil_state(state, 0);
+      break;
+   case CSO_RASTERIZER:
+      delete_rasterizer_state(state, 0);
+      break;
+   case CSO_VELEMENTS:
+      delete_velements(state, 0);
+      break;
+   default:
+      assert(0);
+      FREE(state);
+   }
+}
+
+
+static INLINE void sanitize_hash(struct cso_cache *sc,
+                                 struct cso_hash *hash,
+                                 enum cso_cache_type type,
+                                 int max_size)
+{
+   if (sc->sanitize_cb)
+      sc->sanitize_cb(hash, type, max_size, sc->sanitize_data);
+}
+
+
+static INLINE void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
+                               int max_size, void *user_data)
+{
+   /* if we're approaching the maximum size, remove a fourth of the entries;
+    * otherwise every subsequent call would go through this same path */
+   int hash_size = cso_hash_size(hash);
+   int max_entries = (max_size > hash_size) ? max_size : hash_size;
+   int to_remove = (max_size < max_entries) * max_entries/4;
+   if (hash_size > max_size)
+      to_remove += hash_size - max_size;
+   while (to_remove) {
+      /* remove elements until we're back under the limit */
+      /* FIXME: currently we pick the nodes to remove at random */
+      struct cso_hash_iter iter = cso_hash_first_node(hash);
+      void *cso = cso_hash_take(hash, cso_hash_iter_key(iter));
+      delete_cso(cso, type);
+      --to_remove;
+   }
+}
+
+struct cso_hash_iter
+cso_insert_state(struct cso_cache *sc,
+                 unsigned hash_key, enum cso_cache_type type,
+                 void *state)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+   sanitize_hash(sc, hash, type, sc->max_size);
+
+   return cso_hash_insert(hash, hash_key, state);
+}
+
+struct cso_hash_iter
+cso_find_state(struct cso_cache *sc,
+               unsigned hash_key, enum cso_cache_type type)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+
+   return cso_hash_find(hash, hash_key);
+}
+
+
+void *cso_hash_find_data_from_template( struct cso_hash *hash,
+                                        unsigned hash_key,
+                                        void *templ,
+                                        int size )
+{
+   struct cso_hash_iter iter = cso_hash_find(hash, hash_key);
+   while (!cso_hash_iter_is_null(iter)) {
+      void *iter_data = cso_hash_iter_data(iter);
+      if (!memcmp(iter_data, templ, size)) {
+         /* We found a match
+          */
+         return iter_data;
+      }
+      iter = cso_hash_iter_next(iter);
+   }
+   return NULL;
+}
+
+
+struct cso_hash_iter cso_find_state_template(struct cso_cache *sc,
+                                             unsigned hash_key, enum cso_cache_type type,
+                                             void *templ, unsigned size)
+{
+   struct cso_hash_iter iter = cso_find_state(sc, hash_key, type);
+   while (!cso_hash_iter_is_null(iter)) {
+      void *iter_data = cso_hash_iter_data(iter);
+      if (!memcmp(iter_data, templ, size))
+         return iter;
+      iter = cso_hash_iter_next(iter);
+   }
+   return iter;
+}
+
+void * cso_take_state(struct cso_cache *sc,
+                      unsigned hash_key, enum cso_cache_type type)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+   return cso_hash_take(hash, hash_key);
+}
+
+struct cso_cache *cso_cache_create(void)
+{
+   struct cso_cache *sc = MALLOC_STRUCT(cso_cache);
+   int i;
+   if (sc == NULL)
+      return NULL;
+
+   sc->max_size = 4096;
+   for (i = 0; i < CSO_CACHE_MAX; i++)
+      sc->hashes[i] = cso_hash_create();
+
+   sc->sanitize_cb = sanitize_cb;
sc->sanitize_data = 0; + + return sc; +} + +void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type, + cso_state_callback func, void *user_data) +{ + struct cso_hash *hash = _cso_hash_for_type(sc, type); + struct cso_hash_iter iter; + + iter = cso_hash_first_node(hash); + while (!cso_hash_iter_is_null(iter)) { + void *state = cso_hash_iter_data(iter); + iter = cso_hash_iter_next(iter); + if (state) { + func(state, user_data); + } + } +} + +void cso_cache_delete(struct cso_cache *sc) +{ + int i; + assert(sc); + + if (!sc) + return; + + /* delete driver data */ + cso_for_each_state(sc, CSO_BLEND, delete_blend_state, 0); + cso_for_each_state(sc, CSO_DEPTH_STENCIL_ALPHA, delete_depth_stencil_state, 0); + cso_for_each_state(sc, CSO_RASTERIZER, delete_rasterizer_state, 0); + cso_for_each_state(sc, CSO_SAMPLER, delete_sampler_state, 0); + cso_for_each_state(sc, CSO_VELEMENTS, delete_velements, 0); + + for (i = 0; i < CSO_CACHE_MAX; i++) + cso_hash_delete(sc->hashes[i]); + + FREE(sc); +} + +void cso_set_maximum_cache_size(struct cso_cache *sc, int number) +{ + int i; + + sc->max_size = number; + + for (i = 0; i < CSO_CACHE_MAX; i++) + sanitize_hash(sc, sc->hashes[i], i, sc->max_size); +} + +int cso_maximum_cache_size(const struct cso_cache *sc) +{ + return sc->max_size; +} + +void cso_cache_set_sanitize_callback(struct cso_cache *sc, + cso_sanitize_callback cb, + void *user_data) +{ + sc->sanitize_cb = cb; + sc->sanitize_data = user_data; +} + diff --git a/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.h b/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.h new file mode 100644 index 0000000000..cc1f1c0e12 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_cache.h @@ -0,0 +1,175 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /** + * @file + * Constant State Object (CSO) cache. + * + * The basic idea is that the states are created via the + * create_state/bind_state/delete_state semantics. The driver is expected to + * perform as much of the Gallium state translation to whatever its internal + * representation is during the create call. Gallium then has a caching + * mechanism where it stores the created states. 
When the pipeline needs an + * actual state change, a bind call is issued. In the bind call the driver + * gets its already translated representation. + * + * Those semantics mean that the driver doesn't do the repeated translations + * of states on every frame, but only once, when a new state is actually + * created. + * + * Even on hardware that doesn't do any kind of state cache, it makes the + * driver look a lot neater, plus it avoids all the redundant state + * translations on every frame. + * + * Currently our constant state objects are: + * - alpha test + * - blend + * - depth stencil + * - fragment shader + * - rasterizer (old setup) + * - sampler + * - vertex shader + * - vertex elements + * + * Things that are not constant state objects include: + * - blend_color + * - clip_state + * - clear_color_state + * - constant_buffer + * - feedback_state + * - framebuffer_state + * - polygon_stipple + * - scissor_state + * - texture_state + * - viewport_state + * + * @author Zack Rusin + */ + +#ifndef CSO_CACHE_H +#define CSO_CACHE_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +/* cso_hash.h is necessary for cso_hash_iter, as MSVC requires structures + * returned by value to be fully defined */ +#include "cso_hash.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +enum cso_cache_type { + CSO_RASTERIZER, + CSO_BLEND, + CSO_DEPTH_STENCIL_ALPHA, + CSO_SAMPLER, + CSO_VELEMENTS, + CSO_CACHE_MAX, +}; + +typedef void (*cso_state_callback)(void *ctx, void *obj); + +typedef void (*cso_sanitize_callback)(struct cso_hash *hash, + enum cso_cache_type type, + int max_size, + void *user_data); + +struct cso_cache; + +struct cso_blend { + struct pipe_blend_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_depth_stencil_alpha { + struct pipe_depth_stencil_alpha_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_rasterizer { + struct pipe_rasterizer_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_sampler { + struct pipe_sampler_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_velems_state { + unsigned count; + struct pipe_vertex_element velems[PIPE_MAX_ATTRIBS]; +}; + +struct cso_velements { + struct cso_velems_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +unsigned cso_construct_key(void *item, int item_size); + +struct cso_cache *cso_cache_create(void); +void cso_cache_delete(struct cso_cache *sc); + +void cso_cache_set_sanitize_callback(struct cso_cache *sc, + cso_sanitize_callback cb, + void *user_data); + +struct cso_hash_iter cso_insert_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *state); +struct cso_hash_iter cso_find_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type); +struct cso_hash_iter cso_find_state_template(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *templ, unsigned size); +void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type, + cso_state_callback func, void *user_data); +void * cso_take_state(struct cso_cache *sc, unsigned hash_key, + enum cso_cache_type type); + +void cso_set_maximum_cache_size(struct cso_cache *sc, int number); +int cso_maximum_cache_size(const struct cso_cache *sc); + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/drivers/video/Gallium/auxiliary/cso_cache/cso_context.c b/drivers/video/Gallium/auxiliary/cso_cache/cso_context.c new file mode 100644 index 0000000000..6805427b81 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_context.c @@ -0,0 +1,1431 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /** + * @file + * + * Wrap the cso cache & hash mechanisms in a simplified + * pipe-driver-specific interface. + * + * @author Zack Rusin + * @author Keith Whitwell + */ + +#include "pipe/p_state.h" +#include "util/u_draw.h" +#include "util/u_framebuffer.h" +#include "util/u_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_vbuf.h" +#include "tgsi/tgsi_parse.h" + +#include "cso_cache/cso_context.h" +#include "cso_cache/cso_cache.h" +#include "cso_cache/cso_hash.h" +#include "cso_context.h" + + +/** + * Info related to samplers and sampler views. + * We have one of these for fragment samplers and another for vertex samplers. + */ +struct sampler_info +{ + struct { + void *samplers[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + } hw; + + void *samplers[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + + void *samplers_saved[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers_saved; + + struct pipe_sampler_view *views[PIPE_MAX_SAMPLERS]; + unsigned nr_views; + + struct pipe_sampler_view *views_saved[PIPE_MAX_SAMPLERS]; + unsigned nr_views_saved; +}; + + + +struct cso_context { + struct pipe_context *pipe; + struct cso_cache *cache; + struct u_vbuf *vbuf; + + boolean has_geometry_shader; + boolean has_streamout; + + struct sampler_info samplers[PIPE_SHADER_TYPES]; + + struct pipe_vertex_buffer aux_vertex_buffer_current; + struct pipe_vertex_buffer aux_vertex_buffer_saved; + unsigned aux_vertex_buffer_index; + + struct pipe_constant_buffer aux_constbuf_current[PIPE_SHADER_TYPES]; + struct pipe_constant_buffer aux_constbuf_saved[PIPE_SHADER_TYPES]; + + unsigned nr_so_targets; + struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS]; + + unsigned nr_so_targets_saved; + struct pipe_stream_output_target *so_targets_saved[PIPE_MAX_SO_BUFFERS]; + + /** Current and saved state. + * The saved state is used as a 1-deep stack. 
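+    *
+    * A sketch of the intended use, e.g. for blend state (temp_blend is
+    * a hypothetical caller-side template):
+    *
+    *    cso_save_blend(ctx);
+    *    cso_set_blend(ctx, &temp_blend);
+    *    ... draw a meta op ...
+    *    cso_restore_blend(ctx);
+    *
+    * Saving twice without an intervening restore triggers an assert.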
+    */
+   void *blend, *blend_saved;
+   void *depth_stencil, *depth_stencil_saved;
+   void *rasterizer, *rasterizer_saved;
+   void *fragment_shader, *fragment_shader_saved;
+   void *vertex_shader, *vertex_shader_saved;
+   void *geometry_shader, *geometry_shader_saved;
+   void *velements, *velements_saved;
+   struct pipe_query *render_condition, *render_condition_saved;
+   uint render_condition_mode, render_condition_mode_saved;
+   boolean render_condition_cond, render_condition_cond_saved;
+
+   struct pipe_clip_state clip;
+   struct pipe_clip_state clip_saved;
+
+   struct pipe_framebuffer_state fb, fb_saved;
+   struct pipe_viewport_state vp, vp_saved;
+   struct pipe_blend_color blend_color;
+   unsigned sample_mask, sample_mask_saved;
+   struct pipe_stencil_ref stencil_ref, stencil_ref_saved;
+};
+
+
+static boolean delete_blend_state(struct cso_context *ctx, void *state)
+{
+   struct cso_blend *cso = (struct cso_blend *)state;
+
+   if (ctx->blend == cso->data)
+      return FALSE;
+
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+   return TRUE;
+}
+
+static boolean delete_depth_stencil_state(struct cso_context *ctx, void *state)
+{
+   struct cso_depth_stencil_alpha *cso =
+      (struct cso_depth_stencil_alpha *)state;
+
+   if (ctx->depth_stencil == cso->data)
+      return FALSE;
+
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+
+   return TRUE;
+}
+
+static boolean delete_sampler_state(struct cso_context *ctx, void *state)
+{
+   struct cso_sampler *cso = (struct cso_sampler *)state;
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+   return TRUE;
+}
+
+static boolean delete_rasterizer_state(struct cso_context *ctx, void *state)
+{
+   struct cso_rasterizer *cso = (struct cso_rasterizer *)state;
+
+   if (ctx->rasterizer == cso->data)
+      return FALSE;
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+   return TRUE;
+}
+
+static boolean delete_vertex_elements(struct cso_context *ctx,
+                                      void *state)
+{
+   struct cso_velements *cso = (struct cso_velements *)state;
+
+   if (ctx->velements == cso->data)
+      return FALSE;
+
+   if (cso->delete_state)
+      cso->delete_state(cso->context, cso->data);
+   FREE(state);
+   return TRUE;
+}
+
+
+static INLINE boolean delete_cso(struct cso_context *ctx,
+                                 void *state, enum cso_cache_type type)
+{
+   switch (type) {
+   case CSO_BLEND:
+      return delete_blend_state(ctx, state);
+   case CSO_SAMPLER:
+      return delete_sampler_state(ctx, state);
+   case CSO_DEPTH_STENCIL_ALPHA:
+      return delete_depth_stencil_state(ctx, state);
+   case CSO_RASTERIZER:
+      return delete_rasterizer_state(ctx, state);
+   case CSO_VELEMENTS:
+      return delete_vertex_elements(ctx, state);
+   default:
+      assert(0);
+      FREE(state);
+   }
+   return FALSE;
+}
+
+static INLINE void
+sanitize_hash(struct cso_hash *hash, enum cso_cache_type type,
+              int max_size, void *user_data)
+{
+   struct cso_context *ctx = (struct cso_context *)user_data;
+   /* if we're approaching the maximum size, remove a fourth of the entries;
+    * otherwise every subsequent call would go through this same path */
+   int hash_size = cso_hash_size(hash);
+   int max_entries = (max_size > hash_size) ?
max_size : hash_size; + int to_remove = (max_size < max_entries) * max_entries/4; + struct cso_hash_iter iter = cso_hash_first_node(hash); + if (hash_size > max_size) + to_remove += hash_size - max_size; + while (to_remove) { + /*remove elements until we're good */ + /*fixme: currently we pick the nodes to remove at random*/ + void *cso = cso_hash_iter_data(iter); + if (delete_cso(ctx, cso, type)) { + iter = cso_hash_erase(hash, iter); + --to_remove; + } else + iter = cso_hash_iter_next(iter); + } +} + +static void cso_init_vbuf(struct cso_context *cso) +{ + struct u_vbuf_caps caps; + + u_vbuf_get_caps(cso->pipe->screen, &caps); + + /* Install u_vbuf if there is anything unsupported. */ + if (!caps.buffer_offset_unaligned || + !caps.buffer_stride_unaligned || + !caps.velem_src_offset_unaligned || + !caps.format_fixed32 || + !caps.format_float16 || + !caps.format_float64 || + !caps.format_norm32 || + !caps.format_scaled32 || + !caps.user_vertex_buffers) { + cso->vbuf = u_vbuf_create(cso->pipe, &caps, + cso->aux_vertex_buffer_index); + } +} + +struct cso_context *cso_create_context( struct pipe_context *pipe ) +{ + struct cso_context *ctx = CALLOC_STRUCT(cso_context); + if (ctx == NULL) + goto out; + + ctx->cache = cso_cache_create(); + if (ctx->cache == NULL) + goto out; + cso_cache_set_sanitize_callback(ctx->cache, + sanitize_hash, + ctx); + + ctx->pipe = pipe; + ctx->sample_mask_saved = ~0; + + ctx->aux_vertex_buffer_index = 0; /* 0 for now */ + + cso_init_vbuf(ctx); + + /* Enable for testing: */ + if (0) cso_set_maximum_cache_size( ctx->cache, 4 ); + + if (pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_GEOMETRY, + PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) { + ctx->has_geometry_shader = TRUE; + } + if (pipe->screen->get_param(pipe->screen, + PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS) != 0) { + ctx->has_streamout = TRUE; + } + + return ctx; + +out: + cso_destroy_context( ctx ); + return NULL; +} + +/** + * Prior to context destruction, this function unbinds all state objects. 
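+ * Per the note on cso_destroy_context() below, this is expected to be
+ * called before cso_destroy_context().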
+ */
+void cso_release_all( struct cso_context *ctx )
+{
+   unsigned i, shader;
+
+   if (ctx->pipe) {
+      ctx->pipe->bind_blend_state( ctx->pipe, NULL );
+      ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL );
+      ctx->pipe->bind_fragment_sampler_states( ctx->pipe, 0, NULL );
+      if (ctx->pipe->bind_vertex_sampler_states)
+         ctx->pipe->bind_vertex_sampler_states(ctx->pipe, 0, NULL);
+      ctx->pipe->bind_depth_stencil_alpha_state( ctx->pipe, NULL );
+      ctx->pipe->bind_fs_state( ctx->pipe, NULL );
+      ctx->pipe->bind_vs_state( ctx->pipe, NULL );
+      ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL );
+      ctx->pipe->set_fragment_sampler_views(ctx->pipe, 0, NULL);
+      if (ctx->pipe->set_vertex_sampler_views)
+         ctx->pipe->set_vertex_sampler_views(ctx->pipe, 0, NULL);
+      if (ctx->pipe->set_stream_output_targets)
+         ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, 0);
+   }
+
+   /* free fragment samplers, views */
+   for (shader = 0; shader < Elements(ctx->samplers); shader++) {
+      struct sampler_info *info = &ctx->samplers[shader];
+      for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+         pipe_sampler_view_reference(&info->views[i], NULL);
+         pipe_sampler_view_reference(&info->views_saved[i], NULL);
+      }
+   }
+
+   util_unreference_framebuffer_state(&ctx->fb);
+   util_unreference_framebuffer_state(&ctx->fb_saved);
+
+   pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer, NULL);
+   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL);
+
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      pipe_resource_reference(&ctx->aux_constbuf_current[i].buffer, NULL);
+      pipe_resource_reference(&ctx->aux_constbuf_saved[i].buffer, NULL);
+   }
+
+   for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      pipe_so_target_reference(&ctx->so_targets[i], NULL);
+      pipe_so_target_reference(&ctx->so_targets_saved[i], NULL);
+   }
+
+   if (ctx->cache) {
+      cso_cache_delete( ctx->cache );
+      ctx->cache = NULL;
+   }
+}
+
+
+/**
+ * Free the CSO context.  NOTE: the state tracker should have previously
+ * called cso_release_all().
+ */
+void cso_destroy_context( struct cso_context *ctx )
+{
+   if (ctx) {
+      if (ctx->vbuf)
+         u_vbuf_destroy(ctx->vbuf);
+      FREE( ctx );
+   }
+}
+
+
+/* These functions will either find the state of the given template
+ * in the cache, or create a new state from the given template,
+ * insert it in the cache and return it.
+ */
+
+/*
+ * If the driver returns 0 from the create method, the data member of
+ * the cso is set to the template itself.
+ */
+
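+/*
+ * In outline, each cso_set_xxx() below follows this pattern (a sketch,
+ * not literal code; "xxx" stands for blend, rasterizer, etc.):
+ *
+ *    hash_key = cso_construct_key((void*)templ, key_size);
+ *    iter = cso_find_state_template(ctx->cache, hash_key, CSO_XXX,
+ *                                   (void*)templ, key_size);
+ *    if (cso_hash_iter_is_null(iter)) {
+ *       cso = MALLOC(...);                               // cache miss:
+ *       cso->data = pipe->create_xxx_state(pipe, templ); // translate once
+ *       cso_insert_state(ctx->cache, hash_key, CSO_XXX, cso);
+ *    }
+ *    handle = cso->data;                                 // cached handle
+ *    if (ctx->xxx != handle)
+ *       pipe->bind_xxx_state(pipe, handle);              // bind on change
+ */
+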
+enum pipe_error cso_set_blend(struct cso_context *ctx,
+                              const struct pipe_blend_state *templ)
+{
+   unsigned key_size, hash_key;
+   struct cso_hash_iter iter;
+   void *handle;
+
+   key_size = templ->independent_blend_enable ?
+                 sizeof(struct pipe_blend_state) :
+                 (char *)&(templ->rt[1]) - (char *)templ;
+   hash_key = cso_construct_key((void*)templ, key_size);
+   iter = cso_find_state_template(ctx->cache, hash_key, CSO_BLEND,
+                                  (void*)templ, key_size);
+
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_blend *cso = MALLOC(sizeof(struct cso_blend));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      memset(&cso->state, 0, sizeof cso->state);
+      memcpy(&cso->state, templ, key_size);
+      cso->data = ctx->pipe->create_blend_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_blend_state;
+      cso->context = ctx->pipe;
+
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_BLEND, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_blend *)cso_hash_iter_data(iter))->data;
+   }
+
+   if (ctx->blend != handle) {
+      ctx->blend = handle;
+      ctx->pipe->bind_blend_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
+void cso_save_blend(struct cso_context *ctx)
+{
+   assert(!ctx->blend_saved);
+   ctx->blend_saved = ctx->blend;
+}
+
+void cso_restore_blend(struct cso_context *ctx)
+{
+   if (ctx->blend != ctx->blend_saved) {
+      ctx->blend = ctx->blend_saved;
+      ctx->pipe->bind_blend_state(ctx->pipe, ctx->blend_saved);
+   }
+   ctx->blend_saved = NULL;
+}
+
+
+
+enum pipe_error
+cso_set_depth_stencil_alpha(struct cso_context *ctx,
+                            const struct pipe_depth_stencil_alpha_state *templ)
+{
+   unsigned key_size = sizeof(struct pipe_depth_stencil_alpha_state);
+   unsigned hash_key = cso_construct_key((void*)templ, key_size);
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key,
+                                                       CSO_DEPTH_STENCIL_ALPHA,
+                                                       (void*)templ, key_size);
+   void *handle;
+
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_depth_stencil_alpha *cso =
+         MALLOC(sizeof(struct cso_depth_stencil_alpha));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      memcpy(&cso->state, templ, sizeof(*templ));
+      cso->data = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe,
+                                                              &cso->state);
+      cso->delete_state =
+         (cso_state_callback)ctx->pipe->delete_depth_stencil_alpha_state;
+      cso->context = ctx->pipe;
+
+      iter = cso_insert_state(ctx->cache, hash_key,
+                              CSO_DEPTH_STENCIL_ALPHA, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_depth_stencil_alpha *)
+                cso_hash_iter_data(iter))->data;
+   }
+
+   if (ctx->depth_stencil != handle) {
+      ctx->depth_stencil = handle;
+      ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
+void cso_save_depth_stencil_alpha(struct cso_context *ctx)
+{
+   assert(!ctx->depth_stencil_saved);
+   ctx->depth_stencil_saved = ctx->depth_stencil;
+}
+
+void cso_restore_depth_stencil_alpha(struct cso_context *ctx)
+{
+   if (ctx->depth_stencil != ctx->depth_stencil_saved) {
+      ctx->depth_stencil = ctx->depth_stencil_saved;
+      ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe,
+                                                ctx->depth_stencil_saved);
+   }
+   ctx->depth_stencil_saved = NULL;
+}
+
+
+
+enum pipe_error cso_set_rasterizer(struct cso_context *ctx,
+                                   const struct pipe_rasterizer_state *templ)
+{
+   unsigned key_size = sizeof(struct pipe_rasterizer_state);
+   unsigned hash_key = cso_construct_key((void*)templ, key_size);
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key,
+                                                       CSO_RASTERIZER,
+                                                       (void*)templ, key_size);
+   void *handle = NULL;
+
+   if (cso_hash_iter_is_null(iter)) {
+      struct
cso_rasterizer *cso = MALLOC(sizeof(struct cso_rasterizer)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_rasterizer_state(ctx->pipe, &cso->state); + cso->delete_state = + (cso_state_callback)ctx->pipe->delete_rasterizer_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_RASTERIZER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_rasterizer *)cso_hash_iter_data(iter))->data; + } + + if (ctx->rasterizer != handle) { + ctx->rasterizer = handle; + ctx->pipe->bind_rasterizer_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_save_rasterizer(struct cso_context *ctx) +{ + assert(!ctx->rasterizer_saved); + ctx->rasterizer_saved = ctx->rasterizer; +} + +void cso_restore_rasterizer(struct cso_context *ctx) +{ + if (ctx->rasterizer != ctx->rasterizer_saved) { + ctx->rasterizer = ctx->rasterizer_saved; + ctx->pipe->bind_rasterizer_state(ctx->pipe, ctx->rasterizer_saved); + } + ctx->rasterizer_saved = NULL; +} + + +void cso_set_fragment_shader_handle(struct cso_context *ctx, void *handle ) +{ + if (ctx->fragment_shader != handle) { + ctx->fragment_shader = handle; + ctx->pipe->bind_fs_state(ctx->pipe, handle); + } +} + +void cso_delete_fragment_shader(struct cso_context *ctx, void *handle ) +{ + if (handle == ctx->fragment_shader) { + /* unbind before deleting */ + ctx->pipe->bind_fs_state(ctx->pipe, NULL); + ctx->fragment_shader = NULL; + } + ctx->pipe->delete_fs_state(ctx->pipe, handle); +} + +void cso_save_fragment_shader(struct cso_context *ctx) +{ + assert(!ctx->fragment_shader_saved); + ctx->fragment_shader_saved = ctx->fragment_shader; +} + +void cso_restore_fragment_shader(struct cso_context *ctx) +{ + if (ctx->fragment_shader_saved != ctx->fragment_shader) { + ctx->pipe->bind_fs_state(ctx->pipe, ctx->fragment_shader_saved); + ctx->fragment_shader = ctx->fragment_shader_saved; + } + ctx->fragment_shader_saved = NULL; +} + + +void cso_set_vertex_shader_handle(struct cso_context *ctx, void *handle) +{ + if (ctx->vertex_shader != handle) { + ctx->vertex_shader = handle; + ctx->pipe->bind_vs_state(ctx->pipe, handle); + } +} + +void cso_delete_vertex_shader(struct cso_context *ctx, void *handle ) +{ + if (handle == ctx->vertex_shader) { + /* unbind before deleting */ + ctx->pipe->bind_vs_state(ctx->pipe, NULL); + ctx->vertex_shader = NULL; + } + ctx->pipe->delete_vs_state(ctx->pipe, handle); +} + +void cso_save_vertex_shader(struct cso_context *ctx) +{ + assert(!ctx->vertex_shader_saved); + ctx->vertex_shader_saved = ctx->vertex_shader; +} + +void cso_restore_vertex_shader(struct cso_context *ctx) +{ + if (ctx->vertex_shader_saved != ctx->vertex_shader) { + ctx->pipe->bind_vs_state(ctx->pipe, ctx->vertex_shader_saved); + ctx->vertex_shader = ctx->vertex_shader_saved; + } + ctx->vertex_shader_saved = NULL; +} + + +void cso_set_framebuffer(struct cso_context *ctx, + const struct pipe_framebuffer_state *fb) +{ + if (memcmp(&ctx->fb, fb, sizeof(*fb)) != 0) { + util_copy_framebuffer_state(&ctx->fb, fb); + ctx->pipe->set_framebuffer_state(ctx->pipe, fb); + } +} + +void cso_save_framebuffer(struct cso_context *ctx) +{ + util_copy_framebuffer_state(&ctx->fb_saved, &ctx->fb); +} + +void cso_restore_framebuffer(struct cso_context *ctx) +{ + if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) { + util_copy_framebuffer_state(&ctx->fb, &ctx->fb_saved); + 
ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb); + util_unreference_framebuffer_state(&ctx->fb_saved); + } +} + + +void cso_set_viewport(struct cso_context *ctx, + const struct pipe_viewport_state *vp) +{ + if (memcmp(&ctx->vp, vp, sizeof(*vp))) { + ctx->vp = *vp; + ctx->pipe->set_viewport_states(ctx->pipe, 0, 1, vp); + } +} + +void cso_save_viewport(struct cso_context *ctx) +{ + ctx->vp_saved = ctx->vp; +} + + +void cso_restore_viewport(struct cso_context *ctx) +{ + if (memcmp(&ctx->vp, &ctx->vp_saved, sizeof(ctx->vp))) { + ctx->vp = ctx->vp_saved; + ctx->pipe->set_viewport_states(ctx->pipe, 0, 1, &ctx->vp); + } +} + + +void cso_set_blend_color(struct cso_context *ctx, + const struct pipe_blend_color *bc) +{ + if (memcmp(&ctx->blend_color, bc, sizeof(ctx->blend_color))) { + ctx->blend_color = *bc; + ctx->pipe->set_blend_color(ctx->pipe, bc); + } +} + +void cso_set_sample_mask(struct cso_context *ctx, unsigned sample_mask) +{ + if (ctx->sample_mask != sample_mask) { + ctx->sample_mask = sample_mask; + ctx->pipe->set_sample_mask(ctx->pipe, sample_mask); + } +} + +void cso_save_sample_mask(struct cso_context *ctx) +{ + ctx->sample_mask_saved = ctx->sample_mask; +} + +void cso_restore_sample_mask(struct cso_context *ctx) +{ + cso_set_sample_mask(ctx, ctx->sample_mask_saved); +} + +void cso_set_stencil_ref(struct cso_context *ctx, + const struct pipe_stencil_ref *sr) +{ + if (memcmp(&ctx->stencil_ref, sr, sizeof(ctx->stencil_ref))) { + ctx->stencil_ref = *sr; + ctx->pipe->set_stencil_ref(ctx->pipe, sr); + } +} + +void cso_save_stencil_ref(struct cso_context *ctx) +{ + ctx->stencil_ref_saved = ctx->stencil_ref; +} + + +void cso_restore_stencil_ref(struct cso_context *ctx) +{ + if (memcmp(&ctx->stencil_ref, &ctx->stencil_ref_saved, + sizeof(ctx->stencil_ref))) { + ctx->stencil_ref = ctx->stencil_ref_saved; + ctx->pipe->set_stencil_ref(ctx->pipe, &ctx->stencil_ref); + } +} + +void cso_set_render_condition(struct cso_context *ctx, + struct pipe_query *query, + boolean condition, uint mode) +{ + struct pipe_context *pipe = ctx->pipe; + + if (ctx->render_condition != query || + ctx->render_condition_mode != mode || + ctx->render_condition_cond != condition) { + pipe->render_condition(pipe, query, condition, mode); + ctx->render_condition = query; + ctx->render_condition_cond = condition; + ctx->render_condition_mode = mode; + } +} + +void cso_save_render_condition(struct cso_context *ctx) +{ + ctx->render_condition_saved = ctx->render_condition; + ctx->render_condition_cond_saved = ctx->render_condition_cond; + ctx->render_condition_mode_saved = ctx->render_condition_mode; +} + +void cso_restore_render_condition(struct cso_context *ctx) +{ + cso_set_render_condition(ctx, ctx->render_condition_saved, + ctx->render_condition_cond_saved, + ctx->render_condition_mode_saved); +} + +void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle) +{ + assert(ctx->has_geometry_shader || !handle); + + if (ctx->has_geometry_shader && ctx->geometry_shader != handle) { + ctx->geometry_shader = handle; + ctx->pipe->bind_gs_state(ctx->pipe, handle); + } +} + +void cso_delete_geometry_shader(struct cso_context *ctx, void *handle) +{ + if (handle == ctx->geometry_shader) { + /* unbind before deleting */ + ctx->pipe->bind_gs_state(ctx->pipe, NULL); + ctx->geometry_shader = NULL; + } + ctx->pipe->delete_gs_state(ctx->pipe, handle); +} + +void cso_save_geometry_shader(struct cso_context *ctx) +{ + if (!ctx->has_geometry_shader) { + return; + } + + assert(!ctx->geometry_shader_saved); + 
ctx->geometry_shader_saved = ctx->geometry_shader;
+}
+
+void cso_restore_geometry_shader(struct cso_context *ctx)
+{
+   if (!ctx->has_geometry_shader) {
+      return;
+   }
+
+   if (ctx->geometry_shader_saved != ctx->geometry_shader) {
+      ctx->pipe->bind_gs_state(ctx->pipe, ctx->geometry_shader_saved);
+      ctx->geometry_shader = ctx->geometry_shader_saved;
+   }
+   ctx->geometry_shader_saved = NULL;
+}
+
+/* clip state */
+
+static INLINE void
+clip_state_cpy(struct pipe_clip_state *dst,
+               const struct pipe_clip_state *src)
+{
+   memcpy(dst->ucp, src->ucp, sizeof(dst->ucp));
+}
+
+static INLINE int
+clip_state_cmp(const struct pipe_clip_state *a,
+               const struct pipe_clip_state *b)
+{
+   return memcmp(a->ucp, b->ucp, sizeof(a->ucp));
+}
+
+void
+cso_set_clip(struct cso_context *ctx,
+             const struct pipe_clip_state *clip)
+{
+   if (clip_state_cmp(&ctx->clip, clip)) {
+      clip_state_cpy(&ctx->clip, clip);
+      ctx->pipe->set_clip_state(ctx->pipe, clip);
+   }
+}
+
+void
+cso_save_clip(struct cso_context *ctx)
+{
+   clip_state_cpy(&ctx->clip_saved, &ctx->clip);
+}
+
+void
+cso_restore_clip(struct cso_context *ctx)
+{
+   if (clip_state_cmp(&ctx->clip, &ctx->clip_saved)) {
+      clip_state_cpy(&ctx->clip, &ctx->clip_saved);
+      ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip_saved);
+   }
+}
+
+enum pipe_error
+cso_set_vertex_elements(struct cso_context *ctx,
+                        unsigned count,
+                        const struct pipe_vertex_element *states)
+{
+   struct u_vbuf *vbuf = ctx->vbuf;
+   unsigned key_size, hash_key;
+   struct cso_hash_iter iter;
+   void *handle;
+   struct cso_velems_state velems_state;
+
+   if (vbuf) {
+      u_vbuf_set_vertex_elements(vbuf, count, states);
+      return PIPE_OK;
+   }
+
+   /* Need to include the count in the stored state data too.
+    * Otherwise the first 'count' pipe_vertex_elements could be identical
+    * even if the counts differ, and there's no guarantee the hash would
+    * differ in that case either.
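+    * (For example, two layouts whose first two elements match but whose
+    * counts are 2 and 3 should not end up with the same key.)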
+ */ + key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned); + velems_state.count = count; + memcpy(velems_state.velems, states, + sizeof(struct pipe_vertex_element) * count); + hash_key = cso_construct_key((void*)&velems_state, key_size); + iter = cso_find_state_template(ctx->cache, hash_key, CSO_VELEMENTS, + (void*)&velems_state, key_size); + + if (cso_hash_iter_is_null(iter)) { + struct cso_velements *cso = MALLOC(sizeof(struct cso_velements)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, &velems_state, key_size); + cso->data = ctx->pipe->create_vertex_elements_state(ctx->pipe, count, + &cso->state.velems[0]); + cso->delete_state = + (cso_state_callback) ctx->pipe->delete_vertex_elements_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_VELEMENTS, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_velements *)cso_hash_iter_data(iter))->data; + } + + if (ctx->velements != handle) { + ctx->velements = handle; + ctx->pipe->bind_vertex_elements_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_save_vertex_elements(struct cso_context *ctx) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf) { + u_vbuf_save_vertex_elements(vbuf); + return; + } + + assert(!ctx->velements_saved); + ctx->velements_saved = ctx->velements; +} + +void cso_restore_vertex_elements(struct cso_context *ctx) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf) { + u_vbuf_restore_vertex_elements(vbuf); + return; + } + + if (ctx->velements != ctx->velements_saved) { + ctx->velements = ctx->velements_saved; + ctx->pipe->bind_vertex_elements_state(ctx->pipe, ctx->velements_saved); + } + ctx->velements_saved = NULL; +} + +/* vertex buffers */ + +void cso_set_vertex_buffers(struct cso_context *ctx, + unsigned start_slot, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf) { + u_vbuf_set_vertex_buffers(vbuf, start_slot, count, buffers); + return; + } + + /* Save what's in the auxiliary slot, so that we can save and restore it + * for meta ops. 
*/ + if (start_slot <= ctx->aux_vertex_buffer_index && + start_slot+count > ctx->aux_vertex_buffer_index) { + if (buffers) { + const struct pipe_vertex_buffer *vb = + buffers + (ctx->aux_vertex_buffer_index - start_slot); + + pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer, + vb->buffer); + memcpy(&ctx->aux_vertex_buffer_current, vb, + sizeof(struct pipe_vertex_buffer)); + } + else { + pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer, + NULL); + ctx->aux_vertex_buffer_current.user_buffer = NULL; + } + } + + ctx->pipe->set_vertex_buffers(ctx->pipe, start_slot, count, buffers); +} + +void cso_save_aux_vertex_buffer_slot(struct cso_context *ctx) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf) { + u_vbuf_save_aux_vertex_buffer_slot(vbuf); + return; + } + + pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, + ctx->aux_vertex_buffer_current.buffer); + memcpy(&ctx->aux_vertex_buffer_saved, &ctx->aux_vertex_buffer_current, + sizeof(struct pipe_vertex_buffer)); +} + +void cso_restore_aux_vertex_buffer_slot(struct cso_context *ctx) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf) { + u_vbuf_restore_aux_vertex_buffer_slot(vbuf); + return; + } + + cso_set_vertex_buffers(ctx, ctx->aux_vertex_buffer_index, 1, + &ctx->aux_vertex_buffer_saved); + pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL); +} + +unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx) +{ + return ctx->aux_vertex_buffer_index; +} + + +/**************** fragment/vertex sampler view state *************************/ + +static enum pipe_error +single_sampler(struct cso_context *ctx, + struct sampler_info *info, + unsigned idx, + const struct pipe_sampler_state *templ) +{ + void *handle = NULL; + + if (templ != NULL) { + unsigned key_size = sizeof(struct pipe_sampler_state); + unsigned hash_key = cso_construct_key((void*)templ, key_size); + struct cso_hash_iter iter = + cso_find_state_template(ctx->cache, + hash_key, CSO_SAMPLER, + (void *) templ, key_size); + + if (cso_hash_iter_is_null(iter)) { + struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state); + cso->delete_state = + (cso_state_callback) ctx->pipe->delete_sampler_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_sampler *)cso_hash_iter_data(iter))->data; + } + } + + info->samplers[idx] = handle; + + return PIPE_OK; +} + +enum pipe_error +cso_single_sampler(struct cso_context *ctx, + unsigned shader_stage, + unsigned idx, + const struct pipe_sampler_state *templ) +{ + return single_sampler(ctx, &ctx->samplers[shader_stage], idx, templ); +} + + + +static void +single_sampler_done(struct cso_context *ctx, unsigned shader_stage) +{ + struct sampler_info *info = &ctx->samplers[shader_stage]; + unsigned i; + + /* find highest non-null sampler */ + for (i = PIPE_MAX_SAMPLERS; i > 0; i--) { + if (info->samplers[i - 1] != NULL) + break; + } + + info->nr_samplers = i; + + if (info->hw.nr_samplers != info->nr_samplers || + memcmp(info->hw.samplers, + info->samplers, + info->nr_samplers * sizeof(void *)) != 0) + { + memcpy(info->hw.samplers, + info->samplers, + info->nr_samplers * sizeof(void *)); + info->hw.nr_samplers = info->nr_samplers; + + 
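      /* bind the updated sampler state objects for the given shader stage */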
+      switch (shader_stage) {
+      case PIPE_SHADER_FRAGMENT:
+         ctx->pipe->bind_fragment_sampler_states(ctx->pipe,
+                                                 info->nr_samplers,
+                                                 info->samplers);
+         break;
+      case PIPE_SHADER_VERTEX:
+         ctx->pipe->bind_vertex_sampler_states(ctx->pipe,
+                                               info->nr_samplers,
+                                               info->samplers);
+         break;
+      case PIPE_SHADER_GEOMETRY:
+         ctx->pipe->bind_geometry_sampler_states(ctx->pipe,
+                                                 info->nr_samplers,
+                                                 info->samplers);
+         break;
+      default:
+         assert(!"bad shader type in single_sampler_done()");
+      }
+   }
+}
+
+void
+cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
+{
+   single_sampler_done(ctx, shader_stage);
+}
+
+
+/*
+ * If this function encounters any errors it will return the last one,
+ * so that we always try to set as many samplers as possible.
+ */
+enum pipe_error
+cso_set_samplers(struct cso_context *ctx,
+                 unsigned shader_stage,
+                 unsigned nr,
+                 const struct pipe_sampler_state **templates)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   unsigned i;
+   enum pipe_error temp, error = PIPE_OK;
+
+   /* TODO: fastpath
+    */
+
+   for (i = 0; i < nr; i++) {
+      temp = single_sampler(ctx, info, i, templates[i]);
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   for ( ; i < info->nr_samplers; i++) {
+      temp = single_sampler(ctx, info, i, NULL);
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   single_sampler_done(ctx, shader_stage);
+
+   return error;
+}
+
+void
+cso_save_samplers(struct cso_context *ctx, unsigned shader_stage)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   info->nr_samplers_saved = info->nr_samplers;
+   memcpy(info->samplers_saved, info->samplers, sizeof(info->samplers));
+}
+
+
+void
+cso_restore_samplers(struct cso_context *ctx, unsigned shader_stage)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   info->nr_samplers = info->nr_samplers_saved;
+   memcpy(info->samplers, info->samplers_saved, sizeof(info->samplers));
+   single_sampler_done(ctx, shader_stage);
+}
+
+
+void
+cso_set_sampler_views(struct cso_context *ctx,
+                      unsigned shader_stage,
+                      unsigned count,
+                      struct pipe_sampler_view **views)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   unsigned i;
+
+   /* reference new views */
+   for (i = 0; i < count; i++) {
+      pipe_sampler_view_reference(&info->views[i], views[i]);
+   }
+   /* unref extra old views, if any */
+   for (; i < info->nr_views; i++) {
+      pipe_sampler_view_reference(&info->views[i], NULL);
+   }
+
+   info->nr_views = count;
+
+   /* bind the new sampler views */
+   switch (shader_stage) {
+   case PIPE_SHADER_FRAGMENT:
+      ctx->pipe->set_fragment_sampler_views(ctx->pipe, count, info->views);
+      break;
+   case PIPE_SHADER_VERTEX:
+      ctx->pipe->set_vertex_sampler_views(ctx->pipe, count, info->views);
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      ctx->pipe->set_geometry_sampler_views(ctx->pipe, count, info->views);
+      break;
+   default:
+      assert(!"bad shader type in cso_set_sampler_views()");
+   }
+}
+
+
+void
+cso_save_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   unsigned i;
+
+   info->nr_views_saved = info->nr_views;
+
+   for (i = 0; i < info->nr_views; i++) {
+      assert(!info->views_saved[i]);
+      pipe_sampler_view_reference(&info->views_saved[i], info->views[i]);
+   }
+}
+
+
+void
+cso_restore_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+{
+   struct sampler_info *info = &ctx->samplers[shader_stage];
+   unsigned i, nr_saved = info->nr_views_saved;
+
+   for (i = 0; i < nr_saved; i++) {
+      pipe_sampler_view_reference(&info->views[i], NULL);
+      /*
move the reference from one pointer to another */ + info->views[i] = info->views_saved[i]; + info->views_saved[i] = NULL; + } + for (; i < info->nr_views; i++) { + pipe_sampler_view_reference(&info->views[i], NULL); + } + + /* bind the old/saved sampler views */ + switch (shader_stage) { + case PIPE_SHADER_FRAGMENT: + ctx->pipe->set_fragment_sampler_views(ctx->pipe, nr_saved, info->views); + break; + case PIPE_SHADER_VERTEX: + ctx->pipe->set_vertex_sampler_views(ctx->pipe, nr_saved, info->views); + break; + case PIPE_SHADER_GEOMETRY: + ctx->pipe->set_geometry_sampler_views(ctx->pipe, nr_saved, info->views); + break; + default: + assert(!"bad shader type in cso_restore_sampler_views()"); + } + + info->nr_views = nr_saved; + info->nr_views_saved = 0; +} + + +void +cso_set_stream_outputs(struct cso_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_bitmask) +{ + struct pipe_context *pipe = ctx->pipe; + uint i; + + if (!ctx->has_streamout) { + assert(num_targets == 0); + return; + } + + if (ctx->nr_so_targets == 0 && num_targets == 0) { + /* Nothing to do. */ + return; + } + + /* reference new targets */ + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference(&ctx->so_targets[i], targets[i]); + } + /* unref extra old targets, if any */ + for (; i < ctx->nr_so_targets; i++) { + pipe_so_target_reference(&ctx->so_targets[i], NULL); + } + + pipe->set_stream_output_targets(pipe, num_targets, targets, + append_bitmask); + ctx->nr_so_targets = num_targets; +} + +void +cso_save_stream_outputs(struct cso_context *ctx) +{ + uint i; + + if (!ctx->has_streamout) { + return; + } + + ctx->nr_so_targets_saved = ctx->nr_so_targets; + + for (i = 0; i < ctx->nr_so_targets; i++) { + assert(!ctx->so_targets_saved[i]); + pipe_so_target_reference(&ctx->so_targets_saved[i], ctx->so_targets[i]); + } +} + +void +cso_restore_stream_outputs(struct cso_context *ctx) +{ + struct pipe_context *pipe = ctx->pipe; + uint i; + + if (!ctx->has_streamout) { + return; + } + + if (ctx->nr_so_targets == 0 && ctx->nr_so_targets_saved == 0) { + /* Nothing to do. 
*/ + return; + } + + for (i = 0; i < ctx->nr_so_targets_saved; i++) { + pipe_so_target_reference(&ctx->so_targets[i], NULL); + /* move the reference from one pointer to another */ + ctx->so_targets[i] = ctx->so_targets_saved[i]; + ctx->so_targets_saved[i] = NULL; + } + for (; i < ctx->nr_so_targets; i++) { + pipe_so_target_reference(&ctx->so_targets[i], NULL); + } + + /* ~0 means append */ + pipe->set_stream_output_targets(pipe, ctx->nr_so_targets_saved, + ctx->so_targets, ~0); + + ctx->nr_so_targets = ctx->nr_so_targets_saved; + ctx->nr_so_targets_saved = 0; +} + +/* constant buffers */ + +void +cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage, + unsigned index, struct pipe_constant_buffer *cb) +{ + struct pipe_context *pipe = cso->pipe; + + pipe->set_constant_buffer(pipe, shader_stage, index, cb); + + if (index == 0) { + util_copy_constant_buffer(&cso->aux_constbuf_current[shader_stage], cb); + } +} + +void +cso_set_constant_buffer_resource(struct cso_context *cso, + unsigned shader_stage, + unsigned index, + struct pipe_resource *buffer) +{ + if (buffer) { + struct pipe_constant_buffer cb; + cb.buffer = buffer; + cb.buffer_offset = 0; + cb.buffer_size = buffer->width0; + cb.user_buffer = NULL; + cso_set_constant_buffer(cso, shader_stage, index, &cb); + } else { + cso_set_constant_buffer(cso, shader_stage, index, NULL); + } +} + +void +cso_save_constant_buffer_slot0(struct cso_context *cso, + unsigned shader_stage) +{ + util_copy_constant_buffer(&cso->aux_constbuf_saved[shader_stage], + &cso->aux_constbuf_current[shader_stage]); +} + +void +cso_restore_constant_buffer_slot0(struct cso_context *cso, + unsigned shader_stage) +{ + cso_set_constant_buffer(cso, shader_stage, 0, + &cso->aux_constbuf_saved[shader_stage]); + pipe_resource_reference(&cso->aux_constbuf_saved[shader_stage].buffer, + NULL); +} + +/* drawing */ + +void +cso_set_index_buffer(struct cso_context *cso, + const struct pipe_index_buffer *ib) +{ + struct u_vbuf *vbuf = cso->vbuf; + + if (vbuf) { + u_vbuf_set_index_buffer(vbuf, ib); + } else { + struct pipe_context *pipe = cso->pipe; + pipe->set_index_buffer(pipe, ib); + } +} + +void +cso_draw_vbo(struct cso_context *cso, + const struct pipe_draw_info *info) +{ + struct u_vbuf *vbuf = cso->vbuf; + + if (vbuf) { + u_vbuf_draw_vbo(vbuf, info); + } else { + struct pipe_context *pipe = cso->pipe; + pipe->draw_vbo(pipe, info); + } +} + +void +cso_draw_arrays(struct cso_context *cso, uint mode, uint start, uint count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + + info.mode = mode; + info.start = start; + info.count = count; + info.min_index = start; + info.max_index = start + count - 1; + + cso_draw_vbo(cso, &info); +} diff --git a/drivers/video/Gallium/auxiliary/cso_cache/cso_context.h b/drivers/video/Gallium/auxiliary/cso_cache/cso_context.h new file mode 100644 index 0000000000..82c8e18def --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_context.h @@ -0,0 +1,239 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef CSO_CONTEXT_H
+#define CSO_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cso_context;
+struct u_vbuf;
+
+struct cso_context *cso_create_context( struct pipe_context *pipe );
+
+void cso_release_all( struct cso_context *ctx );
+
+void cso_destroy_context( struct cso_context *cso );
+
+
+
+enum pipe_error cso_set_blend( struct cso_context *cso,
+                               const struct pipe_blend_state *blend );
+void cso_save_blend(struct cso_context *cso);
+void cso_restore_blend(struct cso_context *cso);
+
+
+
+enum pipe_error cso_set_depth_stencil_alpha( struct cso_context *cso,
+                                             const struct pipe_depth_stencil_alpha_state *dsa );
+void cso_save_depth_stencil_alpha(struct cso_context *cso);
+void cso_restore_depth_stencil_alpha(struct cso_context *cso);
+
+
+
+enum pipe_error cso_set_rasterizer( struct cso_context *cso,
+                                    const struct pipe_rasterizer_state *rasterizer );
+void cso_save_rasterizer(struct cso_context *cso);
+void cso_restore_rasterizer(struct cso_context *cso);
+
+
+enum pipe_error
+cso_set_samplers(struct cso_context *cso,
+                 unsigned shader_stage,
+                 unsigned count,
+                 const struct pipe_sampler_state **states);
+
+void
+cso_save_samplers(struct cso_context *cso, unsigned shader_stage);
+
+void
+cso_restore_samplers(struct cso_context *cso, unsigned shader_stage);
+
+/* Alternate interface to support state trackers that like to modify
+ * samplers one at a time:
+ */
+enum pipe_error
+cso_single_sampler(struct cso_context *cso,
+                   unsigned shader_stage,
+                   unsigned idx,
+                   const struct pipe_sampler_state *templ);
+
+void
+cso_single_sampler_done(struct cso_context *cso, unsigned shader_stage);
+
+
+enum pipe_error cso_set_vertex_elements(struct cso_context *ctx,
+                                        unsigned count,
+                                        const struct pipe_vertex_element *states);
+void cso_save_vertex_elements(struct cso_context *ctx);
+void cso_restore_vertex_elements(struct cso_context *ctx);
+
+
+void cso_set_vertex_buffers(struct cso_context *ctx,
+                            unsigned start_slot, unsigned count,
+                            const struct pipe_vertex_buffer *buffers);
+
+/* One vertex buffer slot is provided with the save/restore functionality.
+ * cso_context chooses the slot; it can be non-zero.
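+ *
+ * A sketch of typical meta-op usage (tmp_vb is a hypothetical
+ * caller-side vertex buffer):
+ *
+ *    cso_save_aux_vertex_buffer_slot(cso);
+ *    cso_set_vertex_buffers(cso, cso_get_aux_vertex_buffer_slot(cso),
+ *                           1, &tmp_vb);
+ *    ... draw ...
+ *    cso_restore_aux_vertex_buffer_slot(cso);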
+ */
+void cso_save_aux_vertex_buffer_slot(struct cso_context *ctx);
+void cso_restore_aux_vertex_buffer_slot(struct cso_context *ctx);
+unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx);
+
+
+void cso_set_stream_outputs(struct cso_context *ctx,
+                            unsigned num_targets,
+                            struct pipe_stream_output_target **targets,
+                            unsigned append_bitmask);
+void cso_save_stream_outputs(struct cso_context *ctx);
+void cso_restore_stream_outputs(struct cso_context *ctx);
+
+
+/*
+ * We don't provide shader caching in CSO.  Most of the time the API
+ * provides object semantics for shaders anyway, and in the cases where
+ * it doesn't (e.g. Mesa's internally generated texenv programs), it is
+ * up to the state tracker to implement its own specialized caching.
+ */
+
+void cso_set_fragment_shader_handle(struct cso_context *ctx, void *handle);
+void cso_delete_fragment_shader(struct cso_context *ctx, void *handle );
+void cso_save_fragment_shader(struct cso_context *cso);
+void cso_restore_fragment_shader(struct cso_context *cso);
+
+
+void cso_set_vertex_shader_handle(struct cso_context *ctx, void *handle);
+void cso_delete_vertex_shader(struct cso_context *ctx, void *handle );
+void cso_save_vertex_shader(struct cso_context *cso);
+void cso_restore_vertex_shader(struct cso_context *cso);
+
+
+void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle);
+void cso_delete_geometry_shader(struct cso_context *ctx, void *handle);
+void cso_save_geometry_shader(struct cso_context *cso);
+void cso_restore_geometry_shader(struct cso_context *cso);
+
+
+void cso_set_framebuffer(struct cso_context *cso,
+                         const struct pipe_framebuffer_state *fb);
+void cso_save_framebuffer(struct cso_context *cso);
+void cso_restore_framebuffer(struct cso_context *cso);
+
+
+void cso_set_viewport(struct cso_context *cso,
+                      const struct pipe_viewport_state *vp);
+void cso_save_viewport(struct cso_context *cso);
+void cso_restore_viewport(struct cso_context *cso);
+
+
+void cso_set_blend_color(struct cso_context *cso,
+                         const struct pipe_blend_color *bc);
+
+void cso_set_sample_mask(struct cso_context *cso, unsigned sample_mask);
+void cso_save_sample_mask(struct cso_context *ctx);
+void cso_restore_sample_mask(struct cso_context *ctx);
+
+void cso_set_stencil_ref(struct cso_context *cso,
+                         const struct pipe_stencil_ref *sr);
+void cso_save_stencil_ref(struct cso_context *cso);
+void cso_restore_stencil_ref(struct cso_context *cso);
+
+void cso_set_render_condition(struct cso_context *cso,
+                              struct pipe_query *query,
+                              boolean condition, uint mode);
+void cso_save_render_condition(struct cso_context *cso);
+void cso_restore_render_condition(struct cso_context *cso);
+
+
+/* clip state */
+
+void
+cso_set_clip(struct cso_context *cso,
+             const struct pipe_clip_state *clip);
+
+void
+cso_save_clip(struct cso_context *cso);
+
+void
+cso_restore_clip(struct cso_context *cso);
+
+
+/* sampler view state */
+
+void
+cso_set_sampler_views(struct cso_context *cso,
+                      unsigned shader_stage,
+                      unsigned count,
+                      struct pipe_sampler_view **views);
+
+void
+cso_save_sampler_views(struct cso_context *cso, unsigned shader_stage);
+
+void
+cso_restore_sampler_views(struct cso_context *cso, unsigned shader_stage);
+
+
+/* constant buffers */
+
+void cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
+                             unsigned index, struct pipe_constant_buffer *cb);
+void cso_set_constant_buffer_resource(struct cso_context *cso,
+                                      unsigned shader_stage,
+                                      unsigned index,
+                                      struct pipe_resource *buffer);
+void
+void cso_save_constant_buffer_slot0(struct cso_context *cso, + unsigned shader_stage); +void cso_restore_constant_buffer_slot0(struct cso_context *cso, + unsigned shader_stage); + + +/* drawing */ + +void +cso_set_index_buffer(struct cso_context *cso, + const struct pipe_index_buffer *ib); + +void +cso_draw_vbo(struct cso_context *cso, + const struct pipe_draw_info *info); + +/* helper drawing function */ +void +cso_draw_arrays(struct cso_context *cso, uint mode, uint start, uint count); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.c b/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.c new file mode 100644 index 0000000000..288cef7b6f --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.c @@ -0,0 +1,439 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin + */ + +#include "util/u_debug.h" +#include "util/u_memory.h" + +#include "cso_hash.h" + +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +static const int MinNumBits = 4; + +static const unsigned char prime_deltas[] = { + 0, 0, 1, 3, 1, 5, 3, 3, 1, 9, 7, 5, 3, 9, 25, 3, + 1, 21, 3, 21, 7, 15, 9, 5, 3, 29, 15, 0, 0, 0, 0, 0 +}; + +static int primeForNumBits(int numBits) +{ + return (1 << numBits) + prime_deltas[numBits]; +} + +/* + Returns the smallest integer n such that + primeForNumBits(n) >= hint.
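+
+   For example, countBits(1000) returns 10: primeForNumBits(9) is
+   512 + 9 = 521, which is still below 1000, while primeForNumBits(10)
+   is 1024 + 7 = 1031, which is not.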
+*/ +static int countBits(int hint) +{ + int numBits = 0; + int bits = hint; + + while (bits > 1) { + bits >>= 1; + numBits++; + } + + if (numBits >= (int)sizeof(prime_deltas)) { + numBits = sizeof(prime_deltas) - 1; + } else if (primeForNumBits(numBits) < hint) { + ++numBits; + } + return numBits; +} + +struct cso_node { + struct cso_node *next; + unsigned key; + void *value; +}; + +struct cso_hash_data { + struct cso_node *fakeNext; + struct cso_node **buckets; + int size; + int nodeSize; + short userNumBits; + short numBits; + int numBuckets; +}; + +struct cso_hash { + union { + struct cso_hash_data *d; + struct cso_node *e; + } data; +}; + +static void *cso_data_allocate_node(struct cso_hash_data *hash) +{ + return MALLOC(hash->nodeSize); +} + +static void cso_free_node(struct cso_node *node) +{ + FREE(node); +} + +static struct cso_node * +cso_hash_create_node(struct cso_hash *hash, + unsigned akey, void *avalue, + struct cso_node **anextNode) +{ + struct cso_node *node = cso_data_allocate_node(hash->data.d); + + if (!node) + return NULL; + + node->key = akey; + node->value = avalue; + + node->next = (struct cso_node*)(*anextNode); + *anextNode = node; + ++hash->data.d->size; + return node; +} + +static void cso_data_rehash(struct cso_hash_data *hash, int hint) +{ + if (hint < 0) { + hint = countBits(-hint); + if (hint < MinNumBits) + hint = MinNumBits; + hash->userNumBits = (short)hint; + while (primeForNumBits(hint) < (hash->size >> 1)) + ++hint; + } else if (hint < MinNumBits) { + hint = MinNumBits; + } + + if (hash->numBits != hint) { + struct cso_node *e = (struct cso_node *)(hash); + struct cso_node **oldBuckets = hash->buckets; + int oldNumBuckets = hash->numBuckets; + int i = 0; + + hash->numBits = (short)hint; + hash->numBuckets = primeForNumBits(hint); + hash->buckets = MALLOC(sizeof(struct cso_node*) * hash->numBuckets); + for (i = 0; i < hash->numBuckets; ++i) + hash->buckets[i] = e; + + for (i = 0; i < oldNumBuckets; ++i) { + struct cso_node *firstNode = oldBuckets[i]; + while (firstNode != e) { + unsigned h = firstNode->key; + struct cso_node *lastNode = firstNode; + struct cso_node *afterLastNode; + struct cso_node **beforeFirstNode; + + while (lastNode->next != e && lastNode->next->key == h) + lastNode = lastNode->next; + + afterLastNode = lastNode->next; + beforeFirstNode = &hash->buckets[h % hash->numBuckets]; + while (*beforeFirstNode != e) + beforeFirstNode = &(*beforeFirstNode)->next; + lastNode->next = *beforeFirstNode; + *beforeFirstNode = firstNode; + firstNode = afterLastNode; + } + } + FREE(oldBuckets); + } +} + +static void cso_data_might_grow(struct cso_hash_data *hash) +{ + if (hash->size >= hash->numBuckets) + cso_data_rehash(hash, hash->numBits + 1); +} + +static void cso_data_has_shrunk(struct cso_hash_data *hash) +{ + if (hash->size <= (hash->numBuckets >> 3) && + hash->numBits > hash->userNumBits) { + int max = MAX(hash->numBits-2, hash->userNumBits); + cso_data_rehash(hash, max); + } +} + +static struct cso_node *cso_data_first_node(struct cso_hash_data *hash) +{ + struct cso_node *e = (struct cso_node *)(hash); + struct cso_node **bucket = hash->buckets; + int n = hash->numBuckets; + while (n--) { + if (*bucket != e) + return *bucket; + ++bucket; + } + return e; +} + +static struct cso_node **cso_hash_find_node(struct cso_hash *hash, unsigned akey) +{ + struct cso_node **node; + + if (hash->data.d->numBuckets) { + node = (struct cso_node **)(&hash->data.d->buckets[akey % hash->data.d->numBuckets]); + assert(*node == hash->data.e || (*node)->next); + 
while (*node != hash->data.e && (*node)->key != akey) + node = &(*node)->next; + } else { + node = (struct cso_node **)((const struct cso_node * const *)(&hash->data.e)); + } + return node; +} + +struct cso_hash_iter cso_hash_insert(struct cso_hash *hash, + unsigned key, void *data) +{ + cso_data_might_grow(hash->data.d); + + { + struct cso_node **nextNode = cso_hash_find_node(hash, key); + struct cso_node *node = cso_hash_create_node(hash, key, data, nextNode); + if (!node) { + struct cso_hash_iter null_iter = {hash, 0}; + return null_iter; + } + + { + struct cso_hash_iter iter = {hash, node}; + return iter; + } + } +} + +struct cso_hash * cso_hash_create(void) +{ + struct cso_hash *hash = MALLOC_STRUCT(cso_hash); + if (!hash) + return NULL; + + hash->data.d = MALLOC_STRUCT(cso_hash_data); + if (!hash->data.d) { + FREE(hash); + return NULL; + } + + hash->data.d->fakeNext = 0; + hash->data.d->buckets = 0; + hash->data.d->size = 0; + hash->data.d->nodeSize = sizeof(struct cso_node); + hash->data.d->userNumBits = (short)MinNumBits; + hash->data.d->numBits = 0; + hash->data.d->numBuckets = 0; + + return hash; +} + +void cso_hash_delete(struct cso_hash *hash) +{ + struct cso_node *e_for_x = (struct cso_node *)(hash->data.d); + struct cso_node **bucket = (struct cso_node **)(hash->data.d->buckets); + int n = hash->data.d->numBuckets; + while (n--) { + struct cso_node *cur = *bucket++; + while (cur != e_for_x) { + struct cso_node *next = cur->next; + cso_free_node(cur); + cur = next; + } + } + FREE(hash->data.d->buckets); + FREE(hash->data.d); + FREE(hash); +} + +struct cso_hash_iter cso_hash_find(struct cso_hash *hash, + unsigned key) +{ + struct cso_node **nextNode = cso_hash_find_node(hash, key); + struct cso_hash_iter iter = {hash, *nextNode}; + return iter; +} + +unsigned cso_hash_iter_key(struct cso_hash_iter iter) +{ + if (!iter.node || iter.hash->data.e == iter.node) + return 0; + return iter.node->key; +} + +void * cso_hash_iter_data(struct cso_hash_iter iter) +{ + if (!iter.node || iter.hash->data.e == iter.node) + return 0; + return iter.node->value; +} + +static struct cso_node *cso_hash_data_next(struct cso_node *node) +{ + union { + struct cso_node *next; + struct cso_node *e; + struct cso_hash_data *d; + } a; + int start; + struct cso_node **bucket; + int n; + + a.next = node->next; + if (!a.next) { + debug_printf("iterating beyond the last element\n"); + return 0; + } + if (a.next->next) + return a.next; + + start = (node->key % a.d->numBuckets) + 1; + bucket = a.d->buckets + start; + n = a.d->numBuckets - start; + while (n--) { + if (*bucket != a.e) + return *bucket; + ++bucket; + } + return a.e; +} + + +static struct cso_node *cso_hash_data_prev(struct cso_node *node) +{ + union { + struct cso_node *e; + struct cso_hash_data *d; + } a; + int start; + struct cso_node *sentinel; + struct cso_node **bucket; + + a.e = node; + while (a.e->next) + a.e = a.e->next; + + if (node == a.e) + start = a.d->numBuckets - 1; + else + start = node->key % a.d->numBuckets; + + sentinel = node; + bucket = a.d->buckets + start; + while (start >= 0) { + if (*bucket != sentinel) { + struct cso_node *prev = *bucket; + while (prev->next != sentinel) + prev = prev->next; + return prev; + } + + sentinel = a.e; + --bucket; + --start; + } + debug_printf("iterating backward beyond first element\n"); + return a.e; +} + +struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter) +{ + struct cso_hash_iter next = {iter.hash, cso_hash_data_next(iter.node)}; + return next; +} + +int 
cso_hash_iter_is_null(struct cso_hash_iter iter) +{ + if (!iter.node || iter.node == iter.hash->data.e) + return 1; + return 0; +} + +void * cso_hash_take(struct cso_hash *hash, + unsigned akey) +{ + struct cso_node **node = cso_hash_find_node(hash, akey); + if (*node != hash->data.e) { + void *t = (*node)->value; + struct cso_node *next = (*node)->next; + cso_free_node(*node); + *node = next; + --hash->data.d->size; + cso_data_has_shrunk(hash->data.d); + return t; + } + return 0; +} + +struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter) +{ + struct cso_hash_iter prev = {iter.hash, + cso_hash_data_prev(iter.node)}; + return prev; +} + +struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash) +{ + struct cso_hash_iter iter = {hash, cso_data_first_node(hash->data.d)}; + return iter; +} + +int cso_hash_size(struct cso_hash *hash) +{ + return hash->data.d->size; +} + +struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter iter) +{ + struct cso_hash_iter ret = iter; + struct cso_node *node = iter.node; + struct cso_node **node_ptr; + + if (node == hash->data.e) + return iter; + + ret = cso_hash_iter_next(ret); + node_ptr = (struct cso_node**)(&hash->data.d->buckets[node->key % hash->data.d->numBuckets]); + while (*node_ptr != node) + node_ptr = &(*node_ptr)->next; + *node_ptr = node->next; + cso_free_node(node); + --hash->data.d->size; + return ret; +} + +boolean cso_hash_contains(struct cso_hash *hash, unsigned key) +{ + struct cso_node **node = cso_hash_find_node(hash, key); + return (*node != hash->data.e); +} diff --git a/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.h b/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.h new file mode 100644 index 0000000000..5891c325fa --- /dev/null +++ b/drivers/video/Gallium/auxiliary/cso_cache/cso_hash.h @@ -0,0 +1,129 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Hash table implementation. + * + * This file provides a hash implementation that is capable of dealing + * with collisions. It stores colliding entries in a linked list. All + * functions operating on the hash return an iterator. The iterator + * itself points to the collision list. If there wasn't any collision + * the list will have just one entry; otherwise client code should + * iterate over the entries to find the exact entry among the ones that + * had the same key (e.g. memcmp could be used on the data to check + * that).
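+ *
+ * A minimal usage sketch (illustrative only; a constant key is used here,
+ * whereas real callers derive the key from the data itself):
+ *
+ *    struct cso_hash *hash = cso_hash_create();
+ *    cso_hash_insert(hash, 42, item);
+ *    if (cso_hash_contains(hash, 42)) {
+ *       void *data = cso_hash_take(hash, 42);   // caller owns data again
+ *    }
+ *    cso_hash_delete(hash);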
+ *
+ * @author Zack Rusin + */ + +#ifndef CSO_HASH_H +#define CSO_HASH_H + +#include "pipe/p_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct cso_hash; +struct cso_node; + + +struct cso_hash_iter { + struct cso_hash *hash; + struct cso_node *node; +}; + + +struct cso_hash *cso_hash_create(void); +void cso_hash_delete(struct cso_hash *hash); + + +int cso_hash_size(struct cso_hash *hash); + + +/** + * Adds data with the given key to the hash. If an entry with the given + * key is already in the hash, the new entry is inserted before it + * in the collision list. + * The function returns an iterator pointing to the inserted item in the hash. + */ +struct cso_hash_iter cso_hash_insert(struct cso_hash *hash, unsigned key, + void *data); +/** + * Removes the item pointed to by the current iterator from the hash. + * Note that the data itself is not erased, and if it was a malloc'ed pointer + * it will have to be freed by the caller after calling this function. + * The function returns an iterator pointing to the item after the removed one in + * the hash. + */ +struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter iter); + +void *cso_hash_take(struct cso_hash *hash, unsigned key); + + + +struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash); + +/** + * Return an iterator pointing to the first entry in the collision list. + */ +struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key); + +/** + * Returns true if a value with the given key exists in the hash. + */ +boolean cso_hash_contains(struct cso_hash *hash, unsigned key); + + +int cso_hash_iter_is_null(struct cso_hash_iter iter); +unsigned cso_hash_iter_key(struct cso_hash_iter iter); +void *cso_hash_iter_data(struct cso_hash_iter iter); + + +struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter); +struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter); + + +/** + * Convenience routine that iterates over the collision list while doing a memory + * comparison to see which entry in the list is a direct copy of our template, + * and returns that entry. + */ +void *cso_hash_find_data_from_template( struct cso_hash *hash, + unsigned hash_key, + void *templ, + int size ); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/drivers/video/Gallium/auxiliary/os/os_time.c b/drivers/video/Gallium/auxiliary/os/os_time.c index f7e4ca49c7..abcba63fe1 100644 --- a/drivers/video/Gallium/auxiliary/os/os_time.c +++ b/drivers/video/Gallium/auxiliary/os/os_time.c @@ -28,21 +28,15 @@ /** * @file * OS independent time-manipulation functions.
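 *
 * A minimal usage sketch (illustrative only; run_workload() is assumed):
 *
 *    int64_t t0 = os_time_get_nano();
 *    run_workload();
 *    int64_t elapsed_ns = os_time_get_nano() - t0;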
- * + * @author Jose Fonseca */ #include "pipe/p_config.h" -#if defined(PIPE_OS_UNIX) # include <time.h> /* timeval */ # include <sys/time.h> /* timeval */ -#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) -# include <windows.h> -#else -# error Unsupported OS -#endif #include "os_time.h" @@ -50,32 +44,8 @@ int64_t os_time_get_nano(void) { -#if defined(PIPE_OS_LINUX) - - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - return tv.tv_nsec + tv.tv_sec*INT64_C(1000000000); - -#elif defined(PIPE_OS_UNIX) - struct timeval tv; gettimeofday(&tv, NULL); - return tv.tv_usec*INT64_C(1000) + tv.tv_sec*INT64_C(1000000000); - -#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) - - static LARGE_INTEGER frequency; - LARGE_INTEGER counter; - if(!frequency.QuadPart) - QueryPerformanceFrequency(&frequency); - QueryPerformanceCounter(&counter); - return counter.QuadPart*INT64_C(1000000000)/frequency.QuadPart; - -#else - -#error Unsupported OS - -#endif } diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer.h b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer.h new file mode 100644 index 0000000000..a5ec93c255 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer.h @@ -0,0 +1,288 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Generic code for buffers. + * + * Behind a pipe buffer handle there can be DMA buffers, client (or user) + * buffers, regular malloced buffers, etc. This file provides an abstract base + * buffer handle that allows the driver to cope with all those kinds of buffers + * in a more flexible way. + * + * There is no obligation for a winsys driver to use this library, and a pipe + * driver should be completely agnostic about it.
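+ *
+ * A minimal usage sketch (illustrative only; the malloc-backed buffer
+ * declared at the end of this header is used because it needs no winsys
+ * behind it):
+ *
+ *    struct pb_desc desc;
+ *    struct pb_buffer *buf;
+ *    void *map;
+ *
+ *    desc.alignment = 64;
+ *    desc.usage = PB_USAGE_CPU_READ_WRITE;
+ *    buf = pb_malloc_buffer_create(4096, &desc);
+ *    map = pb_map(buf, PB_USAGE_CPU_WRITE, NULL);
+ *    memset(map, 0, 4096);
+ *    pb_unmap(buf);
+ *    pb_reference(&buf, NULL);   // drop the last reference, destroying it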
+ *
+ * \author Jose Fonseca + */ + +#ifndef PB_BUFFER_H_ +#define PB_BUFFER_H_ + + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "util/u_inlines.h" +#include "pipe/p_defines.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pb_vtbl; +struct pb_validate; +struct pipe_fence_handle; + + +#define PB_USAGE_CPU_READ (1 << 0) +#define PB_USAGE_CPU_WRITE (1 << 1) +#define PB_USAGE_GPU_READ (1 << 2) +#define PB_USAGE_GPU_WRITE (1 << 3) +#define PB_USAGE_UNSYNCHRONIZED (1 << 10) +#define PB_USAGE_DONTBLOCK (1 << 9) + +#define PB_USAGE_CPU_READ_WRITE \ + ( PB_USAGE_CPU_READ | PB_USAGE_CPU_WRITE ) +#define PB_USAGE_GPU_READ_WRITE \ + ( PB_USAGE_GPU_READ | PB_USAGE_GPU_WRITE ) +#define PB_USAGE_WRITE \ + ( PB_USAGE_CPU_WRITE | PB_USAGE_GPU_WRITE ) + +/** + * Buffer description. + * + * Used when allocating the buffer. + */ +struct pb_desc +{ + unsigned alignment; + unsigned usage; +}; + + +/** + * Size. Regular (32-bit) unsigned for now. + */ +typedef unsigned pb_size; + + +/** + * Base class for all pb_* buffers. + */ +struct pb_buffer +{ + struct pipe_reference reference; + unsigned size; + unsigned alignment; + unsigned usage; + + /** + * Pointer to the virtual function table. + * + * Avoid accessing this table directly. Use the inline functions below + * instead to avoid mistakes. + */ + const struct pb_vtbl *vtbl; +}; + + +/** + * Virtual function table for the buffer storage operations. + * + * Note that creation is not done through this table. + */ +struct pb_vtbl +{ + void (*destroy)( struct pb_buffer *buf ); + + /** + * Map the entire data store of a buffer object into the client's address + * space. flags is a bitmask of PB_USAGE_CPU_READ/WRITE. + */ + void *(*map)( struct pb_buffer *buf, + unsigned flags, void *flush_ctx ); + + void (*unmap)( struct pb_buffer *buf ); + + enum pipe_error (*validate)( struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags ); + + void (*fence)( struct pb_buffer *buf, + struct pipe_fence_handle *fence ); + + /** + * Get the base buffer and the offset. + * + * A buffer can be subdivided into smaller buffers. This method should return + * the underlying buffer, and the relative offset. + * + * Buffers without an underlying base buffer should return themselves, with + * a zero offset. + * + * Note that this will increase the reference count of the base buffer.
+ */ + void (*get_base_buffer)( struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset ); + +}; + + + +/* Accessor functions for pb->vtbl: + */ +static INLINE void * +pb_map(struct pb_buffer *buf, + unsigned flags, void *flush_ctx) +{ + assert(buf); + if(!buf) + return NULL; + assert(pipe_is_referenced(&buf->reference)); + return buf->vtbl->map(buf, flags, flush_ctx); +} + + +static INLINE void +pb_unmap(struct pb_buffer *buf) +{ + assert(buf); + if(!buf) + return; + assert(pipe_is_referenced(&buf->reference)); + buf->vtbl->unmap(buf); +} + + +static INLINE void +pb_get_base_buffer( struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset ) +{ + assert(buf); + if(!buf) { + *base_buf = NULL; + *offset = 0; + return; + } + assert(pipe_is_referenced(&buf->reference)); + assert(buf->vtbl->get_base_buffer); + buf->vtbl->get_base_buffer(buf, base_buf, offset); + assert(*base_buf); + assert(*offset < (*base_buf)->size); +} + + +static INLINE enum pipe_error +pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags) +{ + assert(buf); + if(!buf) + return PIPE_ERROR; + assert(buf->vtbl->validate); + return buf->vtbl->validate(buf, vl, flags); +} + + +static INLINE void +pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence) +{ + assert(buf); + if(!buf) + return; + assert(buf->vtbl->fence); + buf->vtbl->fence(buf, fence); +} + + +static INLINE void +pb_destroy(struct pb_buffer *buf) +{ + assert(buf); + if(!buf) + return; + assert(!pipe_is_referenced(&buf->reference)); + buf->vtbl->destroy(buf); +} + +static INLINE void +pb_reference(struct pb_buffer **dst, + struct pb_buffer *src) +{ + struct pb_buffer *old = *dst; + + if (pipe_reference(&(*dst)->reference, &src->reference)) + pb_destroy( old ); + *dst = src; +} + + +/** + * Utility function to check whether the provided alignment is consistent with + * the requested one. + */ +static INLINE boolean +pb_check_alignment(pb_size requested, pb_size provided) +{ + if(!requested) + return TRUE; + if(requested > provided) + return FALSE; + if(provided % requested != 0) + return FALSE; + return TRUE; +} + + +/** + * Utility function to check whether the provided usage flags are consistent + * with the requested ones. + */ +static INLINE boolean +pb_check_usage(unsigned requested, unsigned provided) +{ + return (requested & provided) == requested ? TRUE : FALSE; +} + + +/** + * Malloc-based buffer to store data that can't be used by the graphics + * hardware. + */ +struct pb_buffer * +pb_malloc_buffer_create(pb_size size, + const struct pb_desc *desc); + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_BUFFER_H_*/ diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.c new file mode 100644 index 0000000000..9e0cacecac --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -0,0 +1,1069 @@ +/************************************************************************** + * + * Copyright 2007-2010 VMware, Inc. + * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Implementation of fenced buffers. + * + * \author Jose Fonseca + * \author Thomas Hellström + */ + + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) +#include <unistd.h> +#include <sched.h> +#endif + +#include "pipe/p_compiler.h" +#include "pipe/p_defines.h" +#include "util/u_debug.h" +#include "os/os_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" + +#include "pb_buffer.h" +#include "pb_buffer_fenced.h" +#include "pb_bufmgr.h" + + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct fenced_manager +{ + struct pb_manager base; + struct pb_manager *provider; + struct pb_fence_ops *ops; + + /** + * Maximum buffer size that can be safely allocated. + */ + pb_size max_buffer_size; + + /** + * Maximum CPU memory we can allocate before we start waiting for the + * GPU to idle. + */ + pb_size max_cpu_total_size; + + /** + * Following members are mutable and protected by this mutex. + */ + pipe_mutex mutex; + + /** + * Fenced buffer list. + * + * All fenced buffers are placed in this list, ordered from the oldest + * fence to the newest fence. + */ + struct list_head fenced; + pb_size num_fenced; + + struct list_head unfenced; + pb_size num_unfenced; + + /** + * How much temporary CPU memory is being used to hold unvalidated buffers. + */ + pb_size cpu_total_size; +}; + + +/** + * Fenced buffer. + * + * Wrapper around a pipe buffer which adds fencing and reference counting. + */ +struct fenced_buffer +{ + /* + * Immutable members. + */ + + struct pb_buffer base; + struct fenced_manager *mgr; + + /* + * Following members are mutable and protected by fenced_manager::mutex. + */ + + struct list_head head; + + /** + * Buffer with storage. + */ + struct pb_buffer *buffer; + pb_size size; + struct pb_desc desc; + + /** + * Temporary CPU storage data. Used when there isn't enough GPU memory to + * store the buffer. + */ + void *data; + + /** + * A bitmask of PB_USAGE_CPU/GPU_READ/WRITE describing the current + * buffer usage.
+ */ + unsigned flags; + + unsigned mapcount; + + struct pb_validate *vl; + unsigned validation_flags; + + struct pipe_fence_handle *fence; +}; + + +static INLINE struct fenced_manager * +fenced_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct fenced_manager *)mgr; +} + + +static INLINE struct fenced_buffer * +fenced_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct fenced_buffer *)buf; +} + + +static void +fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf); + +static enum pipe_error +fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf); + +static void +fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf); + +static enum pipe_error +fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf, + boolean wait); + +static enum pipe_error +fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf); + +static enum pipe_error +fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf); + + +/** + * Dump the fenced buffer list. + * + * Useful to understand failures to allocate buffers. + */ +static void +fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) +{ +#ifdef DEBUG + struct pb_fence_ops *ops = fenced_mgr->ops; + struct list_head *curr, *next; + struct fenced_buffer *fenced_buf; + + debug_printf("%10s %7s %8s %7s %10s %s\n", + "buffer", "size", "refcount", "storage", "fence", "signalled"); + + curr = fenced_mgr->unfenced.next; + next = curr->next; + while(curr != &fenced_mgr->unfenced) { + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + assert(!fenced_buf->fence); + debug_printf("%10p %7u %8u %7s\n", + (void *) fenced_buf, + fenced_buf->base.size, + p_atomic_read(&fenced_buf->base.reference.count), + fenced_buf->buffer ? "gpu" : (fenced_buf->data ? "cpu" : "none")); + curr = next; + next = curr->next; + } + + curr = fenced_mgr->fenced.next; + next = curr->next; + while(curr != &fenced_mgr->fenced) { + int signaled; + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + assert(fenced_buf->buffer); + signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); + debug_printf("%10p %7u %8u %7s %10p %s\n", + (void *) fenced_buf, + fenced_buf->base.size, + p_atomic_read(&fenced_buf->base.reference.count), + "gpu", + (void *) fenced_buf->fence, + signaled == 0 ? "y" : "n"); + curr = next; + next = curr->next; + } +#else + (void)fenced_mgr; +#endif +} + + +static INLINE void +fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + assert(!pipe_is_referenced(&fenced_buf->base.reference)); + + assert(!fenced_buf->fence); + assert(fenced_buf->head.prev); + assert(fenced_buf->head.next); + LIST_DEL(&fenced_buf->head); + assert(fenced_mgr->num_unfenced); + --fenced_mgr->num_unfenced; + + fenced_buffer_destroy_gpu_storage_locked(fenced_buf); + fenced_buffer_destroy_cpu_storage_locked(fenced_buf); + + FREE(fenced_buf); +} + + +/** + * Add the buffer to the fenced list. + * + * Reference count should be incremented before calling this function. 
+ */ +static INLINE void +fenced_buffer_add_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + assert(pipe_is_referenced(&fenced_buf->base.reference)); + assert(fenced_buf->flags & PB_USAGE_GPU_READ_WRITE); + assert(fenced_buf->fence); + + p_atomic_inc(&fenced_buf->base.reference.count); + + LIST_DEL(&fenced_buf->head); + assert(fenced_mgr->num_unfenced); + --fenced_mgr->num_unfenced; + LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->fenced); + ++fenced_mgr->num_fenced; +} + + +/** + * Remove the buffer from the fenced list, and potentially destroy the buffer + * if the reference count reaches zero. + * + * Returns TRUE if the buffer was destroyed. + */ +static INLINE boolean +fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + struct pb_fence_ops *ops = fenced_mgr->ops; + + assert(fenced_buf->fence); + assert(fenced_buf->mgr == fenced_mgr); + + ops->fence_reference(ops, &fenced_buf->fence, NULL); + fenced_buf->flags &= ~PB_USAGE_GPU_READ_WRITE; + + assert(fenced_buf->head.prev); + assert(fenced_buf->head.next); + + LIST_DEL(&fenced_buf->head); + assert(fenced_mgr->num_fenced); + --fenced_mgr->num_fenced; + + LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + ++fenced_mgr->num_unfenced; + + if (p_atomic_dec_zero(&fenced_buf->base.reference.count)) { + fenced_buffer_destroy_locked(fenced_mgr, fenced_buf); + return TRUE; + } + + return FALSE; +} + + +/** + * Wait for the fence to expire, and remove the buffer from the fenced list. + * + * This function will release and re-acquire the mutex, so any copy of mutable + * state must be discarded after calling it. + */ +static INLINE enum pipe_error +fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + struct pb_fence_ops *ops = fenced_mgr->ops; + enum pipe_error ret = PIPE_ERROR; + +#if 0 + debug_warning("waiting for GPU"); +#endif + + assert(pipe_is_referenced(&fenced_buf->base.reference)); + assert(fenced_buf->fence); + + if(fenced_buf->fence) { + struct pipe_fence_handle *fence = NULL; + int finished; + boolean proceed; + + ops->fence_reference(ops, &fence, fenced_buf->fence); + + pipe_mutex_unlock(fenced_mgr->mutex); + + finished = ops->fence_finish(ops, fenced_buf->fence, 0); + + pipe_mutex_lock(fenced_mgr->mutex); + + assert(pipe_is_referenced(&fenced_buf->base.reference)); + + /* + * Only proceed if the fence object didn't change in the meanwhile. + * Otherwise assume the work has been already carried out by another + * thread that re-acquired the lock before us. + */ + proceed = fence == fenced_buf->fence ? TRUE : FALSE; + + ops->fence_reference(ops, &fence, NULL); + + if(proceed && finished == 0) { + /* + * Remove from the fenced list + */ + + boolean destroyed; + + destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); + + /* TODO: remove subsequent buffers with the same fence? */ + + assert(!destroyed); + + fenced_buf->flags &= ~PB_USAGE_GPU_READ_WRITE; + + ret = PIPE_OK; + } + } + + return ret; +} + + +/** + * Remove as many fenced buffers from the fenced list as possible. + * + * Returns TRUE if at least one buffer was removed.
+ */ +static boolean +fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr, + boolean wait) +{ + struct pb_fence_ops *ops = fenced_mgr->ops; + struct list_head *curr, *next; + struct fenced_buffer *fenced_buf; + struct pipe_fence_handle *prev_fence = NULL; + boolean ret = FALSE; + + curr = fenced_mgr->fenced.next; + next = curr->next; + while(curr != &fenced_mgr->fenced) { + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + + if(fenced_buf->fence != prev_fence) { + int signaled; + + if (wait) { + signaled = ops->fence_finish(ops, fenced_buf->fence, 0); + + /* + * Don't return just now. Instead preemptively check if the + * following buffers' fences already expired, without further waits. + */ + wait = FALSE; + } + else { + signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); + } + + if (signaled != 0) { + return ret; + } + + prev_fence = fenced_buf->fence; + } + else { + /* This buffer's fence object is identical to the previous buffer's + * fence object, so no need to check the fence again. + */ + assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0); + } + + fenced_buffer_remove_locked(fenced_mgr, fenced_buf); + + ret = TRUE; + + curr = next; + next = curr->next; + } + + return ret; +} + + +/** + * Try to free some GPU memory by backing it up into CPU memory. + * + * Returns TRUE if at least one buffer was freed. + */ +static boolean +fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr) +{ + struct list_head *curr, *next; + struct fenced_buffer *fenced_buf; + + curr = fenced_mgr->unfenced.next; + next = curr->next; + while(curr != &fenced_mgr->unfenced) { + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + + /* + * We can only move storage if the buffer is not mapped and not + * validated. + */ + if(fenced_buf->buffer && + !fenced_buf->mapcount && + !fenced_buf->vl) { + enum pipe_error ret; + + ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf); + if(ret == PIPE_OK) { + ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf); + if(ret == PIPE_OK) { + fenced_buffer_destroy_gpu_storage_locked(fenced_buf); + return TRUE; + } + fenced_buffer_destroy_cpu_storage_locked(fenced_buf); + } + } + + curr = next; + next = curr->next; + } + + return FALSE; +} + + +/** + * Destroy CPU storage for this buffer. + */ +static void +fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf) +{ + if(fenced_buf->data) { + align_free(fenced_buf->data); + fenced_buf->data = NULL; + assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size); + fenced_buf->mgr->cpu_total_size -= fenced_buf->size; + } +} + + +/** + * Create CPU storage for this buffer. + */ +static enum pipe_error +fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + assert(!fenced_buf->data); + if(fenced_buf->data) + return PIPE_OK; + + if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size) + return PIPE_ERROR_OUT_OF_MEMORY; + + fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment); + if(!fenced_buf->data) + return PIPE_ERROR_OUT_OF_MEMORY; + + fenced_mgr->cpu_total_size += fenced_buf->size; + + return PIPE_OK; +} + + +/** + * Destroy the GPU storage. + */ +static void +fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf) +{ + if(fenced_buf->buffer) { + pb_reference(&fenced_buf->buffer, NULL); + } +} + + +/** + * Try to create GPU storage for this buffer. 
+ * + * This function is a shorthand around pb_manager::create_buffer for + * fenced_buffer_create_gpu_storage_locked()'s benefit. + */ +static INLINE boolean +fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf) +{ + struct pb_manager *provider = fenced_mgr->provider; + + assert(!fenced_buf->buffer); + + fenced_buf->buffer = provider->create_buffer(fenced_mgr->provider, + fenced_buf->size, + &fenced_buf->desc); + return fenced_buf->buffer ? TRUE : FALSE; +} + + +/** + * Create GPU storage for this buffer. + */ +static enum pipe_error +fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr, + struct fenced_buffer *fenced_buf, + boolean wait) +{ + assert(!fenced_buf->buffer); + + /* + * Check for signaled buffers before trying to allocate. + */ + fenced_manager_check_signalled_locked(fenced_mgr, FALSE); + + fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); + + /* + * Keep trying while there is some sort of progress: + * - fences are expiring, + * - or buffers are being swapped out from GPU memory into CPU memory. + */ + while(!fenced_buf->buffer && + (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) || + fenced_manager_free_gpu_storage_locked(fenced_mgr))) { + fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); + } + + if(!fenced_buf->buffer && wait) { + /* + * Same as before, but this time around, wait to free buffers if + * necessary. + */ + while(!fenced_buf->buffer && + (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) || + fenced_manager_free_gpu_storage_locked(fenced_mgr))) { + fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); + } + } + + if(!fenced_buf->buffer) { + if(0) + fenced_manager_dump_locked(fenced_mgr); + + /* give up */ + return PIPE_ERROR_OUT_OF_MEMORY; + } + + return PIPE_OK; +} + + +static enum pipe_error +fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf) +{ + uint8_t *map; + + assert(fenced_buf->data); + assert(fenced_buf->buffer); + + map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE, NULL); + if(!map) + return PIPE_ERROR; + + memcpy(map, fenced_buf->data, fenced_buf->size); + + pb_unmap(fenced_buf->buffer); + + return PIPE_OK; +} + + +static enum pipe_error +fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf) +{ + const uint8_t *map; + + assert(fenced_buf->data); + assert(fenced_buf->buffer); + + map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ, NULL); + if(!map) + return PIPE_ERROR; + + memcpy(fenced_buf->data, map, fenced_buf->size); + + pb_unmap(fenced_buf->buffer); + + return PIPE_OK; +} + + +static void +fenced_buffer_destroy(struct pb_buffer *buf) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + + assert(!pipe_is_referenced(&fenced_buf->base.reference)); + + pipe_mutex_lock(fenced_mgr->mutex); + + fenced_buffer_destroy_locked(fenced_mgr, fenced_buf); + + pipe_mutex_unlock(fenced_mgr->mutex); +} + + +static void * +fenced_buffer_map(struct pb_buffer *buf, + unsigned flags, void *flush_ctx) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + struct pb_fence_ops *ops = fenced_mgr->ops; + void *map = NULL; + + pipe_mutex_lock(fenced_mgr->mutex); + + assert(!(flags & PB_USAGE_GPU_READ_WRITE)); + + /* + * Serialize writes.
+ */ + while((fenced_buf->flags & PB_USAGE_GPU_WRITE) || + ((fenced_buf->flags & PB_USAGE_GPU_READ) && + (flags & PB_USAGE_CPU_WRITE))) { + + /* + * Don't wait for the GPU to finish accessing it if blocking is forbidden. + */ + if((flags & PB_USAGE_DONTBLOCK) && + ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) { + goto done; + } + + if (flags & PB_USAGE_UNSYNCHRONIZED) { + break; + } + + /* + * Wait for the GPU to finish accessing. This will release and re-acquire + * the mutex, so all copies of mutable state must be discarded. + */ + fenced_buffer_finish_locked(fenced_mgr, fenced_buf); + } + + if(fenced_buf->buffer) { + map = pb_map(fenced_buf->buffer, flags, flush_ctx); + } + else { + assert(fenced_buf->data); + map = fenced_buf->data; + } + + if(map) { + ++fenced_buf->mapcount; + fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE; + } + +done: + pipe_mutex_unlock(fenced_mgr->mutex); + + return map; +} + + +static void +fenced_buffer_unmap(struct pb_buffer *buf) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + + pipe_mutex_lock(fenced_mgr->mutex); + + assert(fenced_buf->mapcount); + if(fenced_buf->mapcount) { + if (fenced_buf->buffer) + pb_unmap(fenced_buf->buffer); + --fenced_buf->mapcount; + if(!fenced_buf->mapcount) + fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE; + } + + pipe_mutex_unlock(fenced_mgr->mutex); +} + + +static enum pipe_error +fenced_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + enum pipe_error ret; + + pipe_mutex_lock(fenced_mgr->mutex); + + if(!vl) { + /* invalidate */ + fenced_buf->vl = NULL; + fenced_buf->validation_flags = 0; + ret = PIPE_OK; + goto done; + } + + assert(flags & PB_USAGE_GPU_READ_WRITE); + assert(!(flags & ~PB_USAGE_GPU_READ_WRITE)); + flags &= PB_USAGE_GPU_READ_WRITE; + + /* Buffer cannot be validated in two different lists */ + if(fenced_buf->vl && fenced_buf->vl != vl) { + ret = PIPE_ERROR_RETRY; + goto done; + } + + if(fenced_buf->vl == vl && + (fenced_buf->validation_flags & flags) == flags) { + /* Nothing to do -- buffer already validated */ + ret = PIPE_OK; + goto done; + } + + /* + * Create and update GPU storage.
+ */ + if(!fenced_buf->buffer) { + assert(!fenced_buf->mapcount); + + ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE); + if(ret != PIPE_OK) { + goto done; + } + + ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf); + if(ret != PIPE_OK) { + fenced_buffer_destroy_gpu_storage_locked(fenced_buf); + goto done; + } + + if(fenced_buf->mapcount) { + debug_printf("warning: validating a buffer while it is still mapped\n"); + } + else { + fenced_buffer_destroy_cpu_storage_locked(fenced_buf); + } + } + + ret = pb_validate(fenced_buf->buffer, vl, flags); + if (ret != PIPE_OK) + goto done; + + fenced_buf->vl = vl; + fenced_buf->validation_flags |= flags; + +done: + pipe_mutex_unlock(fenced_mgr->mutex); + + return ret; +} + + +static void +fenced_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + struct pb_fence_ops *ops = fenced_mgr->ops; + + pipe_mutex_lock(fenced_mgr->mutex); + + assert(pipe_is_referenced(&fenced_buf->base.reference)); + assert(fenced_buf->buffer); + + if(fence != fenced_buf->fence) { + assert(fenced_buf->vl); + assert(fenced_buf->validation_flags); + + if (fenced_buf->fence) { + boolean destroyed; + destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); + assert(!destroyed); + } + if (fence) { + ops->fence_reference(ops, &fenced_buf->fence, fence); + fenced_buf->flags |= fenced_buf->validation_flags; + fenced_buffer_add_locked(fenced_mgr, fenced_buf); + } + + pb_fence(fenced_buf->buffer, fence); + + fenced_buf->vl = NULL; + fenced_buf->validation_flags = 0; + } + + pipe_mutex_unlock(fenced_mgr->mutex); +} + + +static void +fenced_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_manager *fenced_mgr = fenced_buf->mgr; + + pipe_mutex_lock(fenced_mgr->mutex); + + /* + * This should only be called when the buffer is validated. Typically + * when processing relocations. + */ + assert(fenced_buf->vl); + assert(fenced_buf->buffer); + + if(fenced_buf->buffer) + pb_get_base_buffer(fenced_buf->buffer, base_buf, offset); + else { + *base_buf = buf; + *offset = 0; + } + + pipe_mutex_unlock(fenced_mgr->mutex); +} + + +static const struct pb_vtbl +fenced_buffer_vtbl = { + fenced_buffer_destroy, + fenced_buffer_map, + fenced_buffer_unmap, + fenced_buffer_validate, + fenced_buffer_fence, + fenced_buffer_get_base_buffer +}; + + +/** + * Wrap a buffer in a fenced buffer. + */ +static struct pb_buffer * +fenced_bufmgr_create_buffer(struct pb_manager *mgr, + pb_size size, + const struct pb_desc *desc) +{ + struct fenced_manager *fenced_mgr = fenced_manager(mgr); + struct fenced_buffer *fenced_buf; + enum pipe_error ret; + + /* + * Don't stall the GPU, waste time evicting buffers, or waste memory + * trying to create a buffer that will most likely never fit into the + * graphics aperture. 
+ */ + if(size > fenced_mgr->max_buffer_size) { + goto no_buffer; + } + + fenced_buf = CALLOC_STRUCT(fenced_buffer); + if(!fenced_buf) + goto no_buffer; + + pipe_reference_init(&fenced_buf->base.reference, 1); + fenced_buf->base.alignment = desc->alignment; + fenced_buf->base.usage = desc->usage; + fenced_buf->base.size = size; + fenced_buf->size = size; + fenced_buf->desc = *desc; + + fenced_buf->base.vtbl = &fenced_buffer_vtbl; + fenced_buf->mgr = fenced_mgr; + + pipe_mutex_lock(fenced_mgr->mutex); + + /* + * Try to create GPU storage without stalling. + */ + ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE); + + /* + * Attempt to use CPU memory to avoid stalling the GPU. + */ + if(ret != PIPE_OK) { + ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf); + } + + /* + * Create GPU storage, waiting for some to be available. + */ + if(ret != PIPE_OK) { + ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE); + } + + /* + * Give up. + */ + if(ret != PIPE_OK) { + goto no_storage; + } + + assert(fenced_buf->buffer || fenced_buf->data); + + LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + ++fenced_mgr->num_unfenced; + pipe_mutex_unlock(fenced_mgr->mutex); + + return &fenced_buf->base; + +no_storage: + pipe_mutex_unlock(fenced_mgr->mutex); + FREE(fenced_buf); +no_buffer: + return NULL; +} + + +static void +fenced_bufmgr_flush(struct pb_manager *mgr) +{ + struct fenced_manager *fenced_mgr = fenced_manager(mgr); + + pipe_mutex_lock(fenced_mgr->mutex); + while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) + ; + pipe_mutex_unlock(fenced_mgr->mutex); + + assert(fenced_mgr->provider->flush); + if(fenced_mgr->provider->flush) + fenced_mgr->provider->flush(fenced_mgr->provider); +} + + +static void +fenced_bufmgr_destroy(struct pb_manager *mgr) +{ + struct fenced_manager *fenced_mgr = fenced_manager(mgr); + + pipe_mutex_lock(fenced_mgr->mutex); + + /* Wait on outstanding fences */ + while (fenced_mgr->num_fenced) { + pipe_mutex_unlock(fenced_mgr->mutex); +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) + sched_yield(); +#endif + pipe_mutex_lock(fenced_mgr->mutex); + while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) + ; + } + +#ifdef DEBUG + /*assert(!fenced_mgr->num_unfenced);*/ +#endif + + pipe_mutex_unlock(fenced_mgr->mutex); + pipe_mutex_destroy(fenced_mgr->mutex); + + if(fenced_mgr->provider) + fenced_mgr->provider->destroy(fenced_mgr->provider); + + fenced_mgr->ops->destroy(fenced_mgr->ops); + + FREE(fenced_mgr); +} + + +struct pb_manager * +fenced_bufmgr_create(struct pb_manager *provider, + struct pb_fence_ops *ops, + pb_size max_buffer_size, + pb_size max_cpu_total_size) +{ + struct fenced_manager *fenced_mgr; + + if(!provider) + return NULL; + + fenced_mgr = CALLOC_STRUCT(fenced_manager); + if (!fenced_mgr) + return NULL; + + fenced_mgr->base.destroy = fenced_bufmgr_destroy; + fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer; + fenced_mgr->base.flush = fenced_bufmgr_flush; + + fenced_mgr->provider = provider; + fenced_mgr->ops = ops; + fenced_mgr->max_buffer_size = max_buffer_size; + fenced_mgr->max_cpu_total_size = max_cpu_total_size; + + LIST_INITHEAD(&fenced_mgr->fenced); + fenced_mgr->num_fenced = 0; + + LIST_INITHEAD(&fenced_mgr->unfenced); + fenced_mgr->num_unfenced = 0; + + pipe_mutex_init(fenced_mgr->mutex); + + return &fenced_mgr->base; +}
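+
+/*
+ * Construction sketch (illustrative only; "winsys_provider" and
+ * "winsys_fence_ops" stand in for whatever the winsys actually supplies):
+ *
+ *    struct pb_manager *fenced_mgr;
+ *    fenced_mgr = fenced_bufmgr_create(winsys_provider, winsys_fence_ops,
+ *                                      32 * 1024 * 1024,
+ *                                      16 * 1024 * 1024);
+ *
+ * Buffers created through fenced_mgr are then fenced with pb_fence() after
+ * validation, and their storage survives the last pb_reference() until the
+ * fence signals.
+ */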
diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.h b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.h new file mode 100644 index 0000000000..004c2b939a --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_fenced.h @@ -0,0 +1,104 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Buffer fencing. + * + * "Fenced buffers" is actually a misnomer. They should be referred to as + * "fenceable buffers", i.e., buffers that can be fenced, but I couldn't find + * the word "fenceable" in the dictionary. + * + * A "fenced buffer" is a decorator around a normal buffer, which adds two + * special properties: + * - the ability for the destruction to be delayed by a fence; + * - reference counting. + * + * Usually DMA buffers have a life-time that will extend the life-time of + * their handle. The end of life is dictated by the fence signalling. + * + * Between the handle's destruction and the fence signalling, the buffer is + * stored in a fenced buffer list. + * + * \author Jose Fonseca + */ + +#ifndef PB_BUFFER_FENCED_H_ +#define PB_BUFFER_FENCED_H_ + + +#include "util/u_debug.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pipe_fence_handle; + + +/** + * List of buffers which are awaiting fence signalling. + */ +struct fenced_buffer_list; + + +struct pb_fence_ops +{ + void (*destroy)( struct pb_fence_ops *ops ); + + /** Set ptr = fence, with reference counting */ + void (*fence_reference)( struct pb_fence_ops *ops, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence ); + + /** + * Checks whether the fence has been signalled. + * \param flag driver-specific meaning + * \return zero on success. + */ + int (*fence_signalled)( struct pb_fence_ops *ops, + struct pipe_fence_handle *fence, + unsigned flag ); + + /** + * Wait for the fence to finish. + * \param flag driver-specific meaning + * \return zero on success.
+ */ + int (*fence_finish)( struct pb_fence_ops *ops, + struct pipe_fence_handle *fence, + unsigned flag ); +}; + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_BUFFER_FENCED_H_*/ diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_malloc.c new file mode 100644 index 0000000000..b106a1a027 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_buffer_malloc.c @@ -0,0 +1,198 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Implementation of malloc-based buffers to store data that can't be processed + * by the hardware. 
+ * + * \author Jose Fonseca + */ + + +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct malloc_buffer +{ + struct pb_buffer base; + void *data; +}; + + +extern const struct pb_vtbl malloc_buffer_vtbl; + +static INLINE struct malloc_buffer * +malloc_buffer(struct pb_buffer *buf) +{ + assert(buf); + if (!buf) + return NULL; + assert(buf->vtbl == &malloc_buffer_vtbl); + return (struct malloc_buffer *)buf; +} + + +static void +malloc_buffer_destroy(struct pb_buffer *buf) +{ + align_free(malloc_buffer(buf)->data); + FREE(buf); +} + + +static void * +malloc_buffer_map(struct pb_buffer *buf, + unsigned flags, + void *flush_ctx) +{ + return malloc_buffer(buf)->data; +} + + +static void +malloc_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +malloc_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + assert(0); + return PIPE_ERROR; +} + + +static void +malloc_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + assert(0); +} + + +static void +malloc_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset) +{ + *base_buf = buf; + *offset = 0; +} + + +const struct pb_vtbl +malloc_buffer_vtbl = { + malloc_buffer_destroy, + malloc_buffer_map, + malloc_buffer_unmap, + malloc_buffer_validate, + malloc_buffer_fence, + malloc_buffer_get_base_buffer +}; + + +struct pb_buffer * +pb_malloc_buffer_create(pb_size size, + const struct pb_desc *desc) +{ + struct malloc_buffer *buf; + + /* TODO: do a single allocation */ + + buf = CALLOC_STRUCT(malloc_buffer); + if(!buf) + return NULL; + + pipe_reference_init(&buf->base.reference, 1); + buf->base.usage = desc->usage; + buf->base.size = size; + buf->base.alignment = desc->alignment; + buf->base.vtbl = &malloc_buffer_vtbl; + + buf->data = align_malloc(size, desc->alignment < sizeof(void*) ? sizeof(void*) : desc->alignment); + if(!buf->data) { + FREE(buf); + return NULL; + } + + return &buf->base; +} + + +static struct pb_buffer * +pb_malloc_bufmgr_create_buffer(struct pb_manager *mgr, + pb_size size, + const struct pb_desc *desc) +{ + return pb_malloc_buffer_create(size, desc); +} + + +static void +pb_malloc_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void +pb_malloc_bufmgr_destroy(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static boolean +pb_malloc_bufmgr_is_buffer_busy( struct pb_manager *mgr, + struct pb_buffer *buf ) +{ + return FALSE; +} + + +static struct pb_manager +pb_malloc_bufmgr = { + pb_malloc_bufmgr_destroy, + pb_malloc_bufmgr_create_buffer, + pb_malloc_bufmgr_flush, + pb_malloc_bufmgr_is_buffer_busy +}; + + +struct pb_manager * +pb_malloc_bufmgr_create(void) +{ + return &pb_malloc_bufmgr; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr.h b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr.h new file mode 100644 index 0000000000..960068c494 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -0,0 +1,218 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer management.
+ *
+ * A buffer manager does only one basic thing: it creates buffers. Actually,
+ * "buffer factory" would probably be a more accurate description.
+ *
+ * You can chain buffer managers so that you can have finer-grained memory
+ * management and pooling.
+ *
+ * For example, for a simple batch buffer manager you would chain:
+ * - the native buffer manager, which provides DMA memory from the graphics
+ *   memory space;
+ * - the pool buffer manager, which keeps around a pool of equally sized
+ *   buffers to avoid the latency associated with the native buffer manager;
+ * - the fenced buffer manager, which will delay buffer destruction until
+ *   the moment the card finishes processing it.
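+ *
+ * A minimal sketch of such a chain (the native manager and fence ops are
+ * placeholders here for whatever the winsys actually supplies; the buffer
+ * count, sizes and limits are purely illustrative):
+ *
+ * \code
+ * struct pb_manager *native = ...;   // winsys-provided (placeholder)
+ * struct pb_fence_ops *ops = ...;    // winsys-provided (placeholder)
+ * struct pb_manager *pooled, *fenced;
+ * struct pb_desc desc;
+ *
+ * memset(&desc, 0, sizeof(desc));
+ * desc.alignment = 4096;
+ *
+ * pooled = pool_bufmgr_create(native, 16, 256*1024, &desc);
+ * fenced = fenced_bufmgr_create(pooled, ops, 0, 0);  // limits illustrative
+ * \endcode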
+ *
+ * \author Jose Fonseca
+ */
+
+#ifndef PB_BUFMGR_H_
+#define PB_BUFMGR_H_
+
+
+#include "pb_buffer.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pb_desc;
+
+
+/**
+ * Abstract base class for all buffer managers.
+ */
+struct pb_manager
+{
+   void
+   (*destroy)( struct pb_manager *mgr );
+
+   struct pb_buffer *
+   (*create_buffer)( struct pb_manager *mgr,
+                     pb_size size,
+                     const struct pb_desc *desc);
+
+   /**
+    * Flush all temporarily held buffers.
+    *
+    * Used mostly to aid debugging memory issues or to clean up resources when
+    * the drivers are long lived.
+    */
+   void
+   (*flush)( struct pb_manager *mgr );
+
+   boolean
+   (*is_buffer_busy)( struct pb_manager *mgr,
+                      struct pb_buffer *buf );
+};
+
+
+/**
+ * Malloc buffer provider.
+ *
+ * Simple wrapper around pb_malloc_buffer_create for convenience.
+ */
+struct pb_manager *
+pb_malloc_bufmgr_create(void);
+
+
+/**
+ * Static buffer pool sub-allocator.
+ *
+ * Manages the allocation of equally sized buffers. It does so by allocating
+ * a single big buffer and dividing it into equally sized buffers.
+ *
+ * It is meant to manage the allocation of batch buffer pools.
+ */
+struct pb_manager *
+pool_bufmgr_create(struct pb_manager *provider,
+                   pb_size n, pb_size size,
+                   const struct pb_desc *desc);
+
+
+/**
+ * Static sub-allocator based on the old memory manager.
+ *
+ * It manages buffers of different sizes. It does so by allocating a buffer
+ * with the size of the heap, and then using the old mm memory manager to
+ * manage that heap.
+ */
+struct pb_manager *
+mm_bufmgr_create(struct pb_manager *provider,
+                 pb_size size, pb_size align2);
+
+/**
+ * Same as mm_bufmgr_create.
+ *
+ * The buffer will be released when the manager is destroyed.
+ */
+struct pb_manager *
+mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
+                             pb_size size, pb_size align2);
+
+
+/**
+ * Slab sub-allocator.
+ */
+struct pb_manager *
+pb_slab_manager_create(struct pb_manager *provider,
+                       pb_size bufSize,
+                       pb_size slabSize,
+                       const struct pb_desc *desc);
+
+/**
+ * Allows a range of buffer sizes, by aggregating multiple slab sub-allocators
+ * with different bucket sizes.
+ */
+struct pb_manager *
+pb_slab_range_manager_create(struct pb_manager *provider,
+                             pb_size minBufSize,
+                             pb_size maxBufSize,
+                             pb_size slabSize,
+                             const struct pb_desc *desc);
+
+
+/**
+ * Time-based buffer cache.
+ *
+ * This manager keeps a cache of recently destroyed buffers for a time
+ * interval, so that they can be reused.
+ */
+struct pb_manager *
+pb_cache_manager_create(struct pb_manager *provider,
+                        unsigned usecs);
+
+
+struct pb_fence_ops;
+
+/**
+ * Fenced buffer manager.
+ *
+ * This manager is just meant for convenience. It wraps the buffers returned
+ * by another manager in fenced buffers, so that buffer destruction is
+ * delayed until the hardware has finished processing them.
+ *
+ * NOTE: the buffer manager that provides the buffers will be destroyed
+ * at the same time.
+ */
+struct pb_manager *
+fenced_bufmgr_create(struct pb_manager *provider,
+                     struct pb_fence_ops *ops,
+                     pb_size max_buffer_size,
+                     pb_size max_cpu_total_size);
+
+
+struct pb_manager *
+pb_alt_manager_create(struct pb_manager *provider1,
+                      struct pb_manager *provider2);
+
+
+/**
+ * Ondemand buffer manager.
+ *
+ * Buffers are created in malloc'ed memory (fast and cached), and the contents
+ * are transferred to a buffer from the provider (typically in slow uncached
+ * memory) when there is an attempt to validate the buffer.
+ *
+ * Ideal for situations where one does not know beforehand whether a given
+ * buffer will effectively be used by the hardware or not.
+ */
+struct pb_manager *
+pb_ondemand_manager_create(struct pb_manager *provider);
+
+
+/**
+ * Debug buffer manager to detect buffer under- and overflows.
+ *
+ * Under/overflow sizes should be a multiple of the largest alignment.
+ */
+struct pb_manager *
+pb_debug_manager_create(struct pb_manager *provider,
+                        pb_size underflow_size, pb_size overflow_size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*PB_BUFMGR_H_*/
diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
new file mode 100644
index 0000000000..f60c836f18
--- /dev/null
+++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -0,0 +1,120 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Allocate buffers from two alternative buffer providers. + * + * \author Jose Fonseca + */ + + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "util/u_memory.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct pb_alt_manager +{ + struct pb_manager base; + + struct pb_manager *provider1; + struct pb_manager *provider2; +}; + + +static INLINE struct pb_alt_manager * +pb_alt_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_alt_manager *)mgr; +} + + +static struct pb_buffer * +pb_alt_manager_create_buffer(struct pb_manager *_mgr, + pb_size size, + const struct pb_desc *desc) +{ + struct pb_alt_manager *mgr = pb_alt_manager(_mgr); + struct pb_buffer *buf; + + buf = mgr->provider1->create_buffer(mgr->provider1, size, desc); + if(buf) + return buf; + + buf = mgr->provider2->create_buffer(mgr->provider2, size, desc); + return buf; +} + + +static void +pb_alt_manager_flush(struct pb_manager *_mgr) +{ + struct pb_alt_manager *mgr = pb_alt_manager(_mgr); + + assert(mgr->provider1->flush); + if(mgr->provider1->flush) + mgr->provider1->flush(mgr->provider1); + + assert(mgr->provider2->flush); + if(mgr->provider2->flush) + mgr->provider2->flush(mgr->provider2); +} + + +static void +pb_alt_manager_destroy(struct pb_manager *mgr) +{ + FREE(mgr); +} + + +struct pb_manager * +pb_alt_manager_create(struct pb_manager *provider1, + struct pb_manager *provider2) +{ + struct pb_alt_manager *mgr; + + if(!provider1 || !provider2) + return NULL; + + mgr = CALLOC_STRUCT(pb_alt_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_alt_manager_destroy; + mgr->base.create_buffer = pb_alt_manager_create_buffer; + mgr->base.flush = pb_alt_manager_flush; + mgr->provider1 = provider1; + mgr->provider2 = provider2; + + return &mgr->base; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c new file mode 100644 index 0000000000..0e6896afd0 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -0,0 +1,411 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Buffer cache. + * + * \author Jose Fonseca + * \author Thomas Hellström + */ + + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "os/os_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_time.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct pb_cache_manager; + + +/** + * Wrapper around a pipe buffer which adds delayed destruction. + */ +struct pb_cache_buffer +{ + struct pb_buffer base; + + struct pb_buffer *buffer; + struct pb_cache_manager *mgr; + + /** Caching time interval */ + int64_t start, end; + + struct list_head head; +}; + + +struct pb_cache_manager +{ + struct pb_manager base; + + struct pb_manager *provider; + unsigned usecs; + + pipe_mutex mutex; + + struct list_head delayed; + pb_size numDelayed; +}; + + +static INLINE struct pb_cache_buffer * +pb_cache_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pb_cache_buffer *)buf; +} + + +static INLINE struct pb_cache_manager * +pb_cache_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_cache_manager *)mgr; +} + + +/** + * Actually destroy the buffer. + */ +static INLINE void +_pb_cache_buffer_destroy(struct pb_cache_buffer *buf) +{ + struct pb_cache_manager *mgr = buf->mgr; + + LIST_DEL(&buf->head); + assert(mgr->numDelayed); + --mgr->numDelayed; + assert(!pipe_is_referenced(&buf->base.reference)); + pb_reference(&buf->buffer, NULL); + FREE(buf); +} + + +/** + * Free as many cache buffers from the list head as possible. 
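+ *
+ * Buffers are appended to the delayed list in destruction order, so the
+ * list head always holds the oldest entry; the walk below stops at the
+ * first buffer whose [start, end) caching window has not yet expired.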
+ */
+static void
+_pb_cache_buffer_list_check_free(struct pb_cache_manager *mgr)
+{
+   struct list_head *curr, *next;
+   struct pb_cache_buffer *buf;
+   int64_t now;
+
+   now = os_time_get();
+
+   curr = mgr->delayed.next;
+   next = curr->next;
+   while(curr != &mgr->delayed) {
+      buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
+
+      if(!os_time_timeout(buf->start, buf->end, now))
+         break;
+
+      _pb_cache_buffer_destroy(buf);
+
+      curr = next;
+      next = curr->next;
+   }
+}
+
+
+static void
+pb_cache_buffer_destroy(struct pb_buffer *_buf)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   struct pb_cache_manager *mgr = buf->mgr;
+
+   pipe_mutex_lock(mgr->mutex);
+   assert(!pipe_is_referenced(&buf->base.reference));
+
+   _pb_cache_buffer_list_check_free(mgr);
+
+   buf->start = os_time_get();
+   buf->end = buf->start + mgr->usecs;
+   LIST_ADDTAIL(&buf->head, &mgr->delayed);
+   ++mgr->numDelayed;
+   pipe_mutex_unlock(mgr->mutex);
+}
+
+
+static void *
+pb_cache_buffer_map(struct pb_buffer *_buf,
+                    unsigned flags, void *flush_ctx)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   return pb_map(buf->buffer, flags, flush_ctx);
+}
+
+
+static void
+pb_cache_buffer_unmap(struct pb_buffer *_buf)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   pb_unmap(buf->buffer);
+}
+
+
+static enum pipe_error
+pb_cache_buffer_validate(struct pb_buffer *_buf,
+                         struct pb_validate *vl,
+                         unsigned flags)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   return pb_validate(buf->buffer, vl, flags);
+}
+
+
+static void
+pb_cache_buffer_fence(struct pb_buffer *_buf,
+                      struct pipe_fence_handle *fence)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   pb_fence(buf->buffer, fence);
+}
+
+
+static void
+pb_cache_buffer_get_base_buffer(struct pb_buffer *_buf,
+                                struct pb_buffer **base_buf,
+                                pb_size *offset)
+{
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
+   pb_get_base_buffer(buf->buffer, base_buf, offset);
+}
+
+
+const struct pb_vtbl
+pb_cache_buffer_vtbl = {
+      pb_cache_buffer_destroy,
+      pb_cache_buffer_map,
+      pb_cache_buffer_unmap,
+      pb_cache_buffer_validate,
+      pb_cache_buffer_fence,
+      pb_cache_buffer_get_base_buffer
+};
+
+
+static INLINE int
+pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
+                          pb_size size,
+                          const struct pb_desc *desc)
+{
+   if(buf->base.size < size)
+      return 0;
+
+   /* be lenient with size, but reject buffers more than twice as large
+    * as requested to limit waste */
+   if(buf->base.size >= 2*size)
+      return 0;
+
+   if(!pb_check_alignment(desc->alignment, buf->base.alignment))
+      return 0;
+
+   if(!pb_check_usage(desc->usage, buf->base.usage))
+      return 0;
+
+   if (buf->mgr->provider->is_buffer_busy) {
+      if (buf->mgr->provider->is_buffer_busy(buf->mgr->provider, buf->buffer))
+         return -1;
+   } else {
+      void *ptr = pb_map(buf->buffer, PB_USAGE_DONTBLOCK, NULL);
+
+      if (!ptr)
+         return -1;
+
+      pb_unmap(buf->buffer);
+   }
+
+   return 1;
+}
+
+
+static struct pb_buffer *
+pb_cache_manager_create_buffer(struct pb_manager *_mgr,
+                               pb_size size,
+                               const struct pb_desc *desc)
+{
+   struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
+   struct pb_cache_buffer *buf;
+   struct pb_cache_buffer *curr_buf;
+   struct list_head *curr, *next;
+   int64_t now;
+   int ret = 0;
+
+   pipe_mutex_lock(mgr->mutex);
+
+   buf = NULL;
+   curr = mgr->delayed.next;
+   next = curr->next;
+
+   /* search in the expired buffers, freeing them in the process */
+   now = os_time_get();
+   while(curr != &mgr->delayed) {
+      curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
+      if(!buf && (ret = pb_cache_is_buffer_compat(curr_buf, size, desc)) > 0)
+         buf = curr_buf;
+      else
if(os_time_timeout(curr_buf->start, curr_buf->end, now)) + _pb_cache_buffer_destroy(curr_buf); + else + /* This buffer (and all hereafter) are still hot in cache */ + break; + if (ret == -1) + break; + curr = next; + next = curr->next; + } + + /* keep searching in the hot buffers */ + if(!buf && ret != -1) { + while(curr != &mgr->delayed) { + curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + ret = pb_cache_is_buffer_compat(curr_buf, size, desc); + if (ret > 0) { + buf = curr_buf; + break; + } + if (ret == -1) + break; + /* no need to check the timeout here */ + curr = next; + next = curr->next; + } + } + + if(buf) { + LIST_DEL(&buf->head); + --mgr->numDelayed; + pipe_mutex_unlock(mgr->mutex); + /* Increase refcount */ + pipe_reference_init(&buf->base.reference, 1); + return &buf->base; + } + + pipe_mutex_unlock(mgr->mutex); + + buf = CALLOC_STRUCT(pb_cache_buffer); + if(!buf) + return NULL; + + buf->buffer = mgr->provider->create_buffer(mgr->provider, size, desc); + + /* Empty the cache and try again. */ + if (!buf->buffer) { + mgr->base.flush(&mgr->base); + buf->buffer = mgr->provider->create_buffer(mgr->provider, size, desc); + } + + if(!buf->buffer) { + FREE(buf); + return NULL; + } + + assert(pipe_is_referenced(&buf->buffer->reference)); + assert(pb_check_alignment(desc->alignment, buf->buffer->alignment)); + assert(pb_check_usage(desc->usage, buf->buffer->usage)); + assert(buf->buffer->size >= size); + + pipe_reference_init(&buf->base.reference, 1); + buf->base.alignment = buf->buffer->alignment; + buf->base.usage = buf->buffer->usage; + buf->base.size = buf->buffer->size; + + buf->base.vtbl = &pb_cache_buffer_vtbl; + buf->mgr = mgr; + + return &buf->base; +} + + +static void +pb_cache_manager_flush(struct pb_manager *_mgr) +{ + struct pb_cache_manager *mgr = pb_cache_manager(_mgr); + struct list_head *curr, *next; + struct pb_cache_buffer *buf; + + pipe_mutex_lock(mgr->mutex); + curr = mgr->delayed.next; + next = curr->next; + while(curr != &mgr->delayed) { + buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + _pb_cache_buffer_destroy(buf); + curr = next; + next = curr->next; + } + pipe_mutex_unlock(mgr->mutex); + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_cache_manager_destroy(struct pb_manager *mgr) +{ + pb_cache_manager_flush(mgr); + FREE(mgr); +} + + +struct pb_manager * +pb_cache_manager_create(struct pb_manager *provider, + unsigned usecs) +{ + struct pb_cache_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_cache_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_cache_manager_destroy; + mgr->base.create_buffer = pb_cache_manager_create_buffer; + mgr->base.flush = pb_cache_manager_flush; + mgr->provider = provider; + mgr->usecs = usecs; + LIST_INITHEAD(&mgr->delayed); + mgr->numDelayed = 0; + pipe_mutex_init(mgr->mutex); + + return &mgr->base; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c new file mode 100644 index 0000000000..567303aa55 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -0,0 +1,497 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Debug buffer manager to detect buffer under- and overflows.
+ *
+ * \author Jose Fonseca
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "os/os_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_double_list.h"
+#include "util/u_time.h"
+#include "util/u_debug_stack.h"
+
+#include "pb_buffer.h"
+#include "pb_bufmgr.h"
+
+
+#ifdef DEBUG
+
+
+#define PB_DEBUG_CREATE_BACKTRACE 8
+#define PB_DEBUG_MAP_BACKTRACE 8
+
+
+/**
+ * Convenience macro (type safe).
+ */
+#define SUPER(__derived) (&(__derived)->base)
+
+
+struct pb_debug_manager;
+
+
+/**
+ * Wrapper around a pipe buffer which adds under- and overflow checking.
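+ *
+ * The underlying allocation is laid out as follows (sketch inferred from
+ * the code below):
+ *
+ * \code
+ *   | underflow pad | user data (base.size) | overflow pad |
+ * \endcode
+ *
+ * Both pads are filled with a fixed pseudo-random byte pattern on creation
+ * and re-verified on map, unmap, validate and destroy.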
+ */
+struct pb_debug_buffer
+{
+   struct pb_buffer base;
+
+   struct pb_buffer *buffer;
+   struct pb_debug_manager *mgr;
+
+   pb_size underflow_size;
+   pb_size overflow_size;
+
+   struct debug_stack_frame create_backtrace[PB_DEBUG_CREATE_BACKTRACE];
+
+   pipe_mutex mutex;
+   unsigned map_count;
+   struct debug_stack_frame map_backtrace[PB_DEBUG_MAP_BACKTRACE];
+
+   struct list_head head;
+};
+
+
+struct pb_debug_manager
+{
+   struct pb_manager base;
+
+   struct pb_manager *provider;
+
+   pb_size underflow_size;
+   pb_size overflow_size;
+
+   pipe_mutex mutex;
+   struct list_head list;
+};
+
+
+static INLINE struct pb_debug_buffer *
+pb_debug_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   return (struct pb_debug_buffer *)buf;
+}
+
+
+static INLINE struct pb_debug_manager *
+pb_debug_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct pb_debug_manager *)mgr;
+}
+
+
+static const uint8_t random_pattern[32] = {
+   0xaf, 0xcf, 0xa5, 0xa2, 0xc2, 0x63, 0x15, 0x1a,
+   0x7e, 0xe2, 0x7e, 0x84, 0x15, 0x49, 0xa2, 0x1e,
+   0x49, 0x63, 0xf5, 0x52, 0x74, 0x66, 0x9e, 0xc4,
+   0x6d, 0xcf, 0x2c, 0x4a, 0x74, 0xe6, 0xfd, 0x94
+};
+
+
+static INLINE void
+fill_random_pattern(uint8_t *dst, pb_size size)
+{
+   pb_size i = 0;
+   while(size--) {
+      *dst++ = random_pattern[i++];
+      i &= sizeof(random_pattern) - 1;
+   }
+}
+
+
+static INLINE boolean
+check_random_pattern(const uint8_t *dst, pb_size size,
+                     pb_size *min_ofs, pb_size *max_ofs)
+{
+   boolean result = TRUE;
+   pb_size i;
+   *min_ofs = size;
+   *max_ofs = 0;
+   for(i = 0; i < size; ++i) {
+      if(*dst++ != random_pattern[i % sizeof(random_pattern)]) {
+         *min_ofs = MIN2(*min_ofs, i);
+         *max_ofs = MAX2(*max_ofs, i);
+         result = FALSE;
+      }
+   }
+   return result;
+}
+
+
+static void
+pb_debug_buffer_fill(struct pb_debug_buffer *buf)
+{
+   uint8_t *map;
+
+   map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE, NULL);
+   assert(map);
+   if(map) {
+      fill_random_pattern(map, buf->underflow_size);
+      fill_random_pattern(map + buf->underflow_size + buf->base.size,
+                          buf->overflow_size);
+      pb_unmap(buf->buffer);
+   }
+}
+
+
+/**
+ * Check for under/overflows.
+ *
+ * Should be called with the buffer unmapped.
+ */
+static void
+pb_debug_buffer_check(struct pb_debug_buffer *buf)
+{
+   uint8_t *map;
+
+   map = pb_map(buf->buffer,
+                PB_USAGE_CPU_READ |
+                PB_USAGE_UNSYNCHRONIZED, NULL);
+   assert(map);
+   if(map) {
+      boolean underflow, overflow;
+      pb_size min_ofs, max_ofs;
+
+      underflow = !check_random_pattern(map, buf->underflow_size,
+                                        &min_ofs, &max_ofs);
+      if(underflow) {
+         debug_printf("buffer underflow (offset -%u%s to -%u bytes) detected\n",
+                      buf->underflow_size - min_ofs,
+                      min_ofs == 0 ? "+" : "",
+                      buf->underflow_size - max_ofs);
+      }
+
+      overflow = !check_random_pattern(map + buf->underflow_size + buf->base.size,
+                                       buf->overflow_size,
+                                       &min_ofs, &max_ofs);
+      if(overflow) {
+         debug_printf("buffer overflow (size %u plus offset %u to %u%s bytes) detected\n",
+                      buf->base.size,
+                      min_ofs,
+                      max_ofs,
+                      max_ofs == buf->overflow_size - 1 ?
"+" : ""); + } + + if(underflow || overflow) + debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE); + + debug_assert(!underflow); + debug_assert(!overflow); + + /* re-fill if not aborted */ + if(underflow) + fill_random_pattern(map, buf->underflow_size); + if(overflow) + fill_random_pattern(map + buf->underflow_size + buf->base.size, + buf->overflow_size); + + pb_unmap(buf->buffer); + } +} + + +static void +pb_debug_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + struct pb_debug_manager *mgr = buf->mgr; + + assert(!pipe_is_referenced(&buf->base.reference)); + + pb_debug_buffer_check(buf); + + pipe_mutex_lock(mgr->mutex); + LIST_DEL(&buf->head); + pipe_mutex_unlock(mgr->mutex); + + pipe_mutex_destroy(buf->mutex); + + pb_reference(&buf->buffer, NULL); + FREE(buf); +} + + +static void * +pb_debug_buffer_map(struct pb_buffer *_buf, + unsigned flags, void *flush_ctx) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + void *map; + + pb_debug_buffer_check(buf); + + map = pb_map(buf->buffer, flags, flush_ctx); + if(!map) + return NULL; + + if(map) { + pipe_mutex_lock(buf->mutex); + ++buf->map_count; + debug_backtrace_capture(buf->map_backtrace, 1, PB_DEBUG_MAP_BACKTRACE); + pipe_mutex_unlock(buf->mutex); + } + + return (uint8_t *)map + buf->underflow_size; +} + + +static void +pb_debug_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + + pipe_mutex_lock(buf->mutex); + assert(buf->map_count); + if(buf->map_count) + --buf->map_count; + pipe_mutex_unlock(buf->mutex); + + pb_unmap(buf->buffer); + + pb_debug_buffer_check(buf); +} + + +static void +pb_debug_buffer_get_base_buffer(struct pb_buffer *_buf, + struct pb_buffer **base_buf, + pb_size *offset) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + pb_get_base_buffer(buf->buffer, base_buf, offset); + *offset += buf->underflow_size; +} + + +static enum pipe_error +pb_debug_buffer_validate(struct pb_buffer *_buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + + pipe_mutex_lock(buf->mutex); + if(buf->map_count) { + debug_printf("%s: attempting to validate a mapped buffer\n", __FUNCTION__); + debug_printf("last map backtrace is\n"); + debug_backtrace_dump(buf->map_backtrace, PB_DEBUG_MAP_BACKTRACE); + } + pipe_mutex_unlock(buf->mutex); + + pb_debug_buffer_check(buf); + + return pb_validate(buf->buffer, vl, flags); +} + + +static void +pb_debug_buffer_fence(struct pb_buffer *_buf, + struct pipe_fence_handle *fence) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + pb_fence(buf->buffer, fence); +} + + +const struct pb_vtbl +pb_debug_buffer_vtbl = { + pb_debug_buffer_destroy, + pb_debug_buffer_map, + pb_debug_buffer_unmap, + pb_debug_buffer_validate, + pb_debug_buffer_fence, + pb_debug_buffer_get_base_buffer +}; + + +static void +pb_debug_manager_dump_locked(struct pb_debug_manager *mgr) +{ + struct list_head *curr, *next; + struct pb_debug_buffer *buf; + + curr = mgr->list.next; + next = curr->next; + while(curr != &mgr->list) { + buf = LIST_ENTRY(struct pb_debug_buffer, curr, head); + + debug_printf("buffer = %p\n", (void *) buf); + debug_printf(" .size = 0x%x\n", buf->base.size); + debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE); + + curr = next; + next = curr->next; + } + +} + + +static struct pb_buffer * +pb_debug_manager_create_buffer(struct pb_manager *_mgr, + pb_size size, + const struct pb_desc *desc) +{ + struct 
pb_debug_manager *mgr = pb_debug_manager(_mgr); + struct pb_debug_buffer *buf; + struct pb_desc real_desc; + pb_size real_size; + + assert(size); + assert(desc->alignment); + + buf = CALLOC_STRUCT(pb_debug_buffer); + if(!buf) + return NULL; + + real_size = mgr->underflow_size + size + mgr->overflow_size; + real_desc = *desc; + real_desc.usage |= PB_USAGE_CPU_WRITE; + real_desc.usage |= PB_USAGE_CPU_READ; + + buf->buffer = mgr->provider->create_buffer(mgr->provider, + real_size, + &real_desc); + if(!buf->buffer) { + FREE(buf); +#if 0 + pipe_mutex_lock(mgr->mutex); + debug_printf("%s: failed to create buffer\n", __FUNCTION__); + if(!LIST_IS_EMPTY(&mgr->list)) + pb_debug_manager_dump_locked(mgr); + pipe_mutex_unlock(mgr->mutex); +#endif + return NULL; + } + + assert(pipe_is_referenced(&buf->buffer->reference)); + assert(pb_check_alignment(real_desc.alignment, buf->buffer->alignment)); + assert(pb_check_usage(real_desc.usage, buf->buffer->usage)); + assert(buf->buffer->size >= real_size); + + pipe_reference_init(&buf->base.reference, 1); + buf->base.alignment = desc->alignment; + buf->base.usage = desc->usage; + buf->base.size = size; + + buf->base.vtbl = &pb_debug_buffer_vtbl; + buf->mgr = mgr; + + buf->underflow_size = mgr->underflow_size; + buf->overflow_size = buf->buffer->size - buf->underflow_size - size; + + debug_backtrace_capture(buf->create_backtrace, 1, PB_DEBUG_CREATE_BACKTRACE); + + pb_debug_buffer_fill(buf); + + pipe_mutex_init(buf->mutex); + + pipe_mutex_lock(mgr->mutex); + LIST_ADDTAIL(&buf->head, &mgr->list); + pipe_mutex_unlock(mgr->mutex); + + return &buf->base; +} + + +static void +pb_debug_manager_flush(struct pb_manager *_mgr) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_debug_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + + pipe_mutex_lock(mgr->mutex); + if(!LIST_IS_EMPTY(&mgr->list)) { + debug_printf("%s: unfreed buffers\n", __FUNCTION__); + pb_debug_manager_dump_locked(mgr); + } + pipe_mutex_unlock(mgr->mutex); + + pipe_mutex_destroy(mgr->mutex); + mgr->provider->destroy(mgr->provider); + FREE(mgr); +} + + +struct pb_manager * +pb_debug_manager_create(struct pb_manager *provider, + pb_size underflow_size, pb_size overflow_size) +{ + struct pb_debug_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_debug_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_debug_manager_destroy; + mgr->base.create_buffer = pb_debug_manager_create_buffer; + mgr->base.flush = pb_debug_manager_flush; + mgr->provider = provider; + mgr->underflow_size = underflow_size; + mgr->overflow_size = overflow_size; + + pipe_mutex_init(mgr->mutex); + LIST_INITHEAD(&mgr->list); + + return &mgr->base; +} + + +#else /* !DEBUG */ + + +struct pb_manager * +pb_debug_manager_create(struct pb_manager *provider, + pb_size underflow_size, pb_size overflow_size) +{ + return provider; +} + + +#endif /* !DEBUG */ diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c new file mode 100644 index 0000000000..453cf45b86 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -0,0 +1,320 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Buffer manager using the old texture memory manager. + * + * \author Jose Fonseca + */ + + +#include "pipe/p_defines.h" +#include "util/u_debug.h" +#include "os/os_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_mm.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct mm_pb_manager +{ + struct pb_manager base; + + pipe_mutex mutex; + + pb_size size; + struct mem_block *heap; + + pb_size align2; + + struct pb_buffer *buffer; + void *map; +}; + + +static INLINE struct mm_pb_manager * +mm_pb_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct mm_pb_manager *)mgr; +} + + +struct mm_buffer +{ + struct pb_buffer base; + + struct mm_pb_manager *mgr; + + struct mem_block *block; +}; + + +static INLINE struct mm_buffer * +mm_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct mm_buffer *)buf; +} + + +static void +mm_buffer_destroy(struct pb_buffer *buf) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + + assert(!pipe_is_referenced(&mm_buf->base.reference)); + + pipe_mutex_lock(mm->mutex); + u_mmFreeMem(mm_buf->block); + FREE(mm_buf); + pipe_mutex_unlock(mm->mutex); +} + + +static void * +mm_buffer_map(struct pb_buffer *buf, + unsigned flags, + void *flush_ctx) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + + /* XXX: it will be necessary to remap here to propagate flush_ctx */ + + return (unsigned char *) mm->map + mm_buf->block->ofs; +} + + +static void +mm_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +mm_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + return pb_validate(mm->buffer, vl, flags); +} + + +static void +mm_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + pb_fence(mm->buffer, fence); +} + + +static void +mm_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset) +{ + struct mm_buffer *mm_buf = 
mm_buffer(buf);
+   struct mm_pb_manager *mm = mm_buf->mgr;
+   pb_get_base_buffer(mm->buffer, base_buf, offset);
+   *offset += mm_buf->block->ofs;
+}
+
+
+static const struct pb_vtbl
+mm_buffer_vtbl = {
+      mm_buffer_destroy,
+      mm_buffer_map,
+      mm_buffer_unmap,
+      mm_buffer_validate,
+      mm_buffer_fence,
+      mm_buffer_get_base_buffer
+};
+
+
+static struct pb_buffer *
+mm_bufmgr_create_buffer(struct pb_manager *mgr,
+                        pb_size size,
+                        const struct pb_desc *desc)
+{
+   struct mm_pb_manager *mm = mm_pb_manager(mgr);
+   struct mm_buffer *mm_buf;
+
+   /* We don't handle alignments larger than the one initially set up */
+   assert(pb_check_alignment(desc->alignment, (pb_size)1 << mm->align2));
+   if(!pb_check_alignment(desc->alignment, (pb_size)1 << mm->align2))
+      return NULL;
+
+   pipe_mutex_lock(mm->mutex);
+
+   mm_buf = CALLOC_STRUCT(mm_buffer);
+   if (!mm_buf) {
+      pipe_mutex_unlock(mm->mutex);
+      return NULL;
+   }
+
+   pipe_reference_init(&mm_buf->base.reference, 1);
+   mm_buf->base.alignment = desc->alignment;
+   mm_buf->base.usage = desc->usage;
+   mm_buf->base.size = size;
+
+   mm_buf->base.vtbl = &mm_buffer_vtbl;
+
+   mm_buf->mgr = mm;
+
+   mm_buf->block = u_mmAllocMem(mm->heap, (int)size, (int)mm->align2, 0);
+   if(!mm_buf->block) {
+#if 0
+      debug_printf("warning: heap full\n");
+      mmDumpMemInfo(mm->heap);
+#endif
+      FREE(mm_buf);
+      pipe_mutex_unlock(mm->mutex);
+      return NULL;
+   }
+
+   /* Some sanity checks */
+   assert(0 <= (pb_size)mm_buf->block->ofs && (pb_size)mm_buf->block->ofs < mm->size);
+   assert(size <= (pb_size)mm_buf->block->size && (pb_size)mm_buf->block->ofs + (pb_size)mm_buf->block->size <= mm->size);
+
+   pipe_mutex_unlock(mm->mutex);
+   return SUPER(mm_buf);
+}
+
+
+static void
+mm_bufmgr_flush(struct pb_manager *mgr)
+{
+   /* No-op */
+}
+
+
+static void
+mm_bufmgr_destroy(struct pb_manager *mgr)
+{
+   struct mm_pb_manager *mm = mm_pb_manager(mgr);
+
+   pipe_mutex_lock(mm->mutex);
+
+   u_mmDestroy(mm->heap);
+
+   pb_unmap(mm->buffer);
+   pb_reference(&mm->buffer, NULL);
+
+   pipe_mutex_unlock(mm->mutex);
+
+   FREE(mgr);
+}
+
+
+struct pb_manager *
+mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
+                             pb_size size, pb_size align2)
+{
+   struct mm_pb_manager *mm;
+
+   if(!buffer)
+      return NULL;
+
+   mm = CALLOC_STRUCT(mm_pb_manager);
+   if (!mm)
+      return NULL;
+
+   mm->base.destroy = mm_bufmgr_destroy;
+   mm->base.create_buffer = mm_bufmgr_create_buffer;
+   mm->base.flush = mm_bufmgr_flush;
+
+   mm->size = size;
+   mm->align2 = align2;   /* log2 of the byte alignment, e.g. 6 for 64 bytes */
+
+   pipe_mutex_init(mm->mutex);
+
+   mm->buffer = buffer;
+
+   mm->map = pb_map(mm->buffer,
+                    PB_USAGE_CPU_READ |
+                    PB_USAGE_CPU_WRITE, NULL);
+   if(!mm->map)
+      goto failure;
+
+   mm->heap = u_mmInit(0, (int)size);
+   if (!mm->heap)
+      goto failure;
+
+   return SUPER(mm);
+
+failure:
+   if(mm->heap)
+      u_mmDestroy(mm->heap);
+   if(mm->map)
+      pb_unmap(mm->buffer);
+   FREE(mm);
+   return NULL;
+}
+
+
+struct pb_manager *
+mm_bufmgr_create(struct pb_manager *provider,
+                 pb_size size, pb_size align2)
+{
+   struct pb_buffer *buffer;
+   struct pb_manager *mgr;
+   struct pb_desc desc;
+
+   if(!provider)
+      return NULL;
+
+   memset(&desc, 0, sizeof(desc));
+   desc.alignment = 1 << align2;
+
+   buffer = provider->create_buffer(provider, size, &desc);
+   if (!buffer)
+      return NULL;
+
+   mgr = mm_bufmgr_create_from_buffer(buffer, size, align2);
+   if (!mgr) {
+      pb_reference(&buffer, NULL);
+      return NULL;
+   }
+
+   return mgr;
+}
diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
new file mode 100644
index
0000000000..e44bc9b17e --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c @@ -0,0 +1,305 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * A variation of malloc buffers which get transferred to real graphics memory + * when there is an attempt to validate them. + * + * @author Jose Fonseca + */ + + +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct pb_ondemand_manager; + + +struct pb_ondemand_buffer +{ + struct pb_buffer base; + + struct pb_ondemand_manager *mgr; + + /** Regular malloc'ed memory */ + void *data; + unsigned mapcount; + + /** Real buffer */ + struct pb_buffer *buffer; + pb_size size; + struct pb_desc desc; +}; + + +struct pb_ondemand_manager +{ + struct pb_manager base; + + struct pb_manager *provider; +}; + + +extern const struct pb_vtbl pb_ondemand_buffer_vtbl; + +static INLINE struct pb_ondemand_buffer * +pb_ondemand_buffer(struct pb_buffer *buf) +{ + assert(buf); + if (!buf) + return NULL; + assert(buf->vtbl == &pb_ondemand_buffer_vtbl); + return (struct pb_ondemand_buffer *)buf; +} + +static INLINE struct pb_ondemand_manager * +pb_ondemand_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_ondemand_manager *)mgr; +} + + +static void +pb_ondemand_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + pb_reference(&buf->buffer, NULL); + + align_free(buf->data); + + FREE(buf); +} + + +static void * +pb_ondemand_buffer_map(struct pb_buffer *_buf, + unsigned flags, void *flush_ctx) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + if(buf->buffer) { + assert(!buf->data); + return pb_map(buf->buffer, flags, flush_ctx); + } + else { + assert(buf->data); + ++buf->mapcount; + return buf->data; + } +} + + +static void +pb_ondemand_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + if(buf->buffer) { + assert(!buf->data); + pb_unmap(buf->buffer); + } + else { + assert(buf->data); + assert(buf->mapcount); + if(buf->mapcount) + --buf->mapcount; + } +} + + +static enum pipe_error 
+pb_ondemand_buffer_instantiate(struct pb_ondemand_buffer *buf)
+{
+   if(!buf->buffer) {
+      struct pb_manager *provider = buf->mgr->provider;
+      uint8_t *map;
+
+      assert(!buf->mapcount);
+
+      buf->buffer = provider->create_buffer(provider, buf->size, &buf->desc);
+      if(!buf->buffer)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      /* map for writing -- we are about to copy the malloc'ed contents in */
+      map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE, NULL);
+      if(!map) {
+         pb_reference(&buf->buffer, NULL);
+         return PIPE_ERROR;
+      }
+
+      memcpy(map, buf->data, buf->size);
+
+      pb_unmap(buf->buffer);
+
+      if(!buf->mapcount) {
+         align_free(buf->data);
+         buf->data = NULL;
+      }
+   }
+
+   return PIPE_OK;
+}
+
+static enum pipe_error
+pb_ondemand_buffer_validate(struct pb_buffer *_buf,
+                            struct pb_validate *vl,
+                            unsigned flags)
+{
+   struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf);
+   enum pipe_error ret;
+
+   assert(!buf->mapcount);
+   if(buf->mapcount)
+      return PIPE_ERROR;
+
+   ret = pb_ondemand_buffer_instantiate(buf);
+   if(ret != PIPE_OK)
+      return ret;
+
+   return pb_validate(buf->buffer, vl, flags);
+}
+
+
+static void
+pb_ondemand_buffer_fence(struct pb_buffer *_buf,
+                         struct pipe_fence_handle *fence)
+{
+   struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf);
+
+   assert(buf->buffer);
+   if(!buf->buffer)
+      return;
+
+   pb_fence(buf->buffer, fence);
+}
+
+
+static void
+pb_ondemand_buffer_get_base_buffer(struct pb_buffer *_buf,
+                                   struct pb_buffer **base_buf,
+                                   pb_size *offset)
+{
+   struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf);
+
+   if(pb_ondemand_buffer_instantiate(buf) != PIPE_OK) {
+      assert(0);
+      *base_buf = &buf->base;
+      *offset = 0;
+      return;
+   }
+
+   pb_get_base_buffer(buf->buffer, base_buf, offset);
+}
+
+
+const struct pb_vtbl
+pb_ondemand_buffer_vtbl = {
+      pb_ondemand_buffer_destroy,
+      pb_ondemand_buffer_map,
+      pb_ondemand_buffer_unmap,
+      pb_ondemand_buffer_validate,
+      pb_ondemand_buffer_fence,
+      pb_ondemand_buffer_get_base_buffer
+};
+
+
+static struct pb_buffer *
+pb_ondemand_manager_create_buffer(struct pb_manager *_mgr,
+                                  pb_size size,
+                                  const struct pb_desc *desc)
+{
+   struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr);
+   struct pb_ondemand_buffer *buf;
+
+   buf = CALLOC_STRUCT(pb_ondemand_buffer);
+   if(!buf)
+      return NULL;
+
+   pipe_reference_init(&buf->base.reference, 1);
+   buf->base.alignment = desc->alignment;
+   buf->base.usage = desc->usage;
+   buf->base.size = size;
+   buf->base.vtbl = &pb_ondemand_buffer_vtbl;
+
+   buf->mgr = mgr;
+
+   buf->data = align_malloc(size, desc->alignment < sizeof(void*) ?
sizeof(void*) : desc->alignment); + if(!buf->data) { + FREE(buf); + return NULL; + } + + buf->size = size; + buf->desc = *desc; + + return &buf->base; +} + + +static void +pb_ondemand_manager_flush(struct pb_manager *_mgr) +{ + struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr); + + mgr->provider->flush(mgr->provider); +} + + +static void +pb_ondemand_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr); + + FREE(mgr); +} + + +struct pb_manager * +pb_ondemand_manager_create(struct pb_manager *provider) +{ + struct pb_ondemand_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_ondemand_manager); + if(!mgr) + return NULL; + + mgr->base.destroy = pb_ondemand_manager_destroy; + mgr->base.create_buffer = pb_ondemand_manager_create_buffer; + mgr->base.flush = pb_ondemand_manager_flush; + + mgr->provider = provider; + + return &mgr->base; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c new file mode 100644 index 0000000000..67a19fe8a6 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c @@ -0,0 +1,321 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * \file + * Batch buffer pool management. + * + * \author Jose Fonseca + * \author Thomas Hellström + */ + + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "os/os_thread.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). 
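+ *
+ * SUPER(obj) upcasts a derived object (e.g. a pool buffer or the pool
+ * manager itself) to its embedded pb_* base by taking the address of its
+ * first member, avoiding an unchecked pointer cast.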
+ */ +#define SUPER(__derived) (&(__derived)->base) + + +struct pool_pb_manager +{ + struct pb_manager base; + + pipe_mutex mutex; + + pb_size bufSize; + pb_size bufAlign; + + pb_size numFree; + pb_size numTot; + + struct list_head free; + + struct pb_buffer *buffer; + void *map; + + struct pool_buffer *bufs; +}; + + +static INLINE struct pool_pb_manager * +pool_pb_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pool_pb_manager *)mgr; +} + + +struct pool_buffer +{ + struct pb_buffer base; + + struct pool_pb_manager *mgr; + + struct list_head head; + + pb_size start; +}; + + +static INLINE struct pool_buffer * +pool_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pool_buffer *)buf; +} + + + +static void +pool_buffer_destroy(struct pb_buffer *buf) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + + assert(!pipe_is_referenced(&pool_buf->base.reference)); + + pipe_mutex_lock(pool->mutex); + LIST_ADD(&pool_buf->head, &pool->free); + pool->numFree++; + pipe_mutex_unlock(pool->mutex); +} + + +static void * +pool_buffer_map(struct pb_buffer *buf, unsigned flags, void *flush_ctx) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + void *map; + + /* XXX: it will be necessary to remap here to propagate flush_ctx */ + + pipe_mutex_lock(pool->mutex); + map = (unsigned char *) pool->map + pool_buf->start; + pipe_mutex_unlock(pool->mutex); + return map; +} + + +static void +pool_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +pool_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + return pb_validate(pool->buffer, vl, flags); +} + + +static void +pool_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + pb_fence(pool->buffer, fence); +} + + +static void +pool_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + pb_size *offset) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + pb_get_base_buffer(pool->buffer, base_buf, offset); + *offset += pool_buf->start; +} + + +static const struct pb_vtbl +pool_buffer_vtbl = { + pool_buffer_destroy, + pool_buffer_map, + pool_buffer_unmap, + pool_buffer_validate, + pool_buffer_fence, + pool_buffer_get_base_buffer +}; + + +static struct pb_buffer * +pool_bufmgr_create_buffer(struct pb_manager *mgr, + pb_size size, + const struct pb_desc *desc) +{ + struct pool_pb_manager *pool = pool_pb_manager(mgr); + struct pool_buffer *pool_buf; + struct list_head *item; + + assert(size == pool->bufSize); + assert(pool->bufAlign % desc->alignment == 0); + + pipe_mutex_lock(pool->mutex); + + if (pool->numFree == 0) { + pipe_mutex_unlock(pool->mutex); + debug_printf("warning: out of fixed size buffer objects\n"); + return NULL; + } + + item = pool->free.next; + + if (item == &pool->free) { + pipe_mutex_unlock(pool->mutex); + debug_printf("error: fixed size buffer pool corruption\n"); + return NULL; + } + + LIST_DEL(item); + --pool->numFree; + + pipe_mutex_unlock(pool->mutex); + + pool_buf = LIST_ENTRY(struct pool_buffer, item, head); + assert(!pipe_is_referenced(&pool_buf->base.reference)); + pipe_reference_init(&pool_buf->base.reference, 1); + pool_buf->base.alignment = 
desc->alignment; + pool_buf->base.usage = desc->usage; + + return SUPER(pool_buf); +} + + +static void +pool_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void +pool_bufmgr_destroy(struct pb_manager *mgr) +{ + struct pool_pb_manager *pool = pool_pb_manager(mgr); + pipe_mutex_lock(pool->mutex); + + FREE(pool->bufs); + + pb_unmap(pool->buffer); + pb_reference(&pool->buffer, NULL); + + pipe_mutex_unlock(pool->mutex); + + FREE(mgr); +} + + +struct pb_manager * +pool_bufmgr_create(struct pb_manager *provider, + pb_size numBufs, + pb_size bufSize, + const struct pb_desc *desc) +{ + struct pool_pb_manager *pool; + struct pool_buffer *pool_buf; + pb_size i; + + if(!provider) + return NULL; + + pool = CALLOC_STRUCT(pool_pb_manager); + if (!pool) + return NULL; + + pool->base.destroy = pool_bufmgr_destroy; + pool->base.create_buffer = pool_bufmgr_create_buffer; + pool->base.flush = pool_bufmgr_flush; + + LIST_INITHEAD(&pool->free); + + pool->numTot = numBufs; + pool->numFree = numBufs; + pool->bufSize = bufSize; + pool->bufAlign = desc->alignment; + + pipe_mutex_init(pool->mutex); + + pool->buffer = provider->create_buffer(provider, numBufs*bufSize, desc); + if (!pool->buffer) + goto failure; + + pool->map = pb_map(pool->buffer, + PB_USAGE_CPU_READ | + PB_USAGE_CPU_WRITE, NULL); + if(!pool->map) + goto failure; + + pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs)); + if (!pool->bufs) + goto failure; + + pool_buf = pool->bufs; + for (i = 0; i < numBufs; ++i) { + pipe_reference_init(&pool_buf->base.reference, 0); + pool_buf->base.alignment = 0; + pool_buf->base.usage = 0; + pool_buf->base.size = bufSize; + pool_buf->base.vtbl = &pool_buffer_vtbl; + pool_buf->mgr = pool; + pool_buf->start = i * bufSize; + LIST_ADDTAIL(&pool_buf->head, &pool->free); + pool_buf++; + } + + return SUPER(pool); + +failure: + FREE(pool->bufs); + if(pool->map) + pb_unmap(pool->buffer); + if(pool->buffer) + pb_reference(&pool->buffer, NULL); + FREE(pool); + return NULL; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c new file mode 100644 index 0000000000..bd84b622b6 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c @@ -0,0 +1,590 @@ +/************************************************************************** + * + * Copyright 2006-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * All Rights Reserved. + * + * Permission is hereby granted, FREE of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * S-lab pool implementation.
+ *
+ * @sa http://en.wikipedia.org/wiki/Slab_allocation
+ *
+ * @author Thomas Hellstrom
+ * @author Jose Fonseca
+ */
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "os/os_thread.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_double_list.h"
+#include "util/u_time.h"
+
+#include "pb_buffer.h"
+#include "pb_bufmgr.h"
+
+
+struct pb_slab;
+
+
+/**
+ * Buffer in a slab.
+ *
+ * Sub-allocation of a contiguous buffer.
+ */
+struct pb_slab_buffer
+{
+   struct pb_buffer base;
+
+   struct pb_slab *slab;
+
+   struct list_head head;
+
+   unsigned mapCount;
+
+   /** Offset relative to the start of the slab buffer. */
+   pb_size start;
+
+   /** Used when validating, to signal that all mappings are finished */
+   /* TODO: Actually validation does not reach this stage yet */
+   pipe_condvar event;
+};
+
+
+/**
+ * Slab -- a contiguous piece of memory.
+ */
+struct pb_slab
+{
+   struct list_head head;
+   struct list_head freeBuffers;
+   pb_size numBuffers;
+   pb_size numFree;
+
+   struct pb_slab_buffer *buffers;
+   struct pb_slab_manager *mgr;
+
+   /** Buffer from the provider */
+   struct pb_buffer *bo;
+
+   void *virtual;
+};
+
+
+/**
+ * It adds/removes slabs as needed in order to satisfy the allocation and
+ * destruction of individual buffers.
+ */
+struct pb_slab_manager
+{
+   struct pb_manager base;
+
+   /** From where we get our buffers */
+   struct pb_manager *provider;
+
+   /** Size of the buffers we hand on downstream */
+   pb_size bufSize;
+
+   /** Size of the buffers we request upstream */
+   pb_size slabSize;
+
+   /**
+    * Alignment, usage to be used to allocate the slab buffers.
+    *
+    * We can only provide buffers which are consistent (in alignment, usage)
+    * with this description.
+    */
+   struct pb_desc desc;
+
+   /**
+    * Partial slabs
+    *
+    * Full slabs are not stored in any list. Empty slabs are destroyed
+    * immediately.
+    */
+   struct list_head slabs;
+
+   pipe_mutex mutex;
+};
+
+
+/**
+ * Wrapper around several slab managers, therefore capable of handling
+ * buffers of multiple sizes.
+ *
+ * This buffer manager just dispatches buffer allocations to the appropriate
+ * slab manager, according to the requested buffer size, or bypasses the slab
+ * managers altogether for even larger sizes.
+ *
+ * The data of this structure remains constant after
+ * initialization and thus needs no mutex protection.
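+ *
+ * Illustration (the bucket sizes here are hypothetical): with buckets of
+ * 1024, 2048 and 4096 bytes, a 3000-byte request would be served from the
+ * 4096-byte bucket, the smallest one that fits, while a request larger
+ * than maxBufSize goes straight to the provider.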
+ */
+struct pb_slab_range_manager
+{
+   struct pb_manager base;
+
+   struct pb_manager *provider;
+
+   pb_size minBufSize;
+   pb_size maxBufSize;
+
+   /** @sa pb_slab_manager::desc */
+   struct pb_desc desc;
+
+   unsigned numBuckets;
+   pb_size *bucketSizes;
+
+   /** Array of pb_slab_manager, one for each bucket size */
+   struct pb_manager **buckets;
+};
+
+
+static INLINE struct pb_slab_buffer *
+pb_slab_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   return (struct pb_slab_buffer *)buf;
+}
+
+
+static INLINE struct pb_slab_manager *
+pb_slab_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct pb_slab_manager *)mgr;
+}
+
+
+static INLINE struct pb_slab_range_manager *
+pb_slab_range_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct pb_slab_range_manager *)mgr;
+}
+
+
+/**
+ * Delete a buffer from the slab delayed list and put
+ * it on the slab free list.
+ */
+static void
+pb_slab_buffer_destroy(struct pb_buffer *_buf)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+   struct pb_slab *slab = buf->slab;
+   struct pb_slab_manager *mgr = slab->mgr;
+   struct list_head *list = &buf->head;
+
+   pipe_mutex_lock(mgr->mutex);
+
+   assert(!pipe_is_referenced(&buf->base.reference));
+
+   buf->mapCount = 0;
+
+   LIST_DEL(list);
+   LIST_ADDTAIL(list, &slab->freeBuffers);
+   slab->numFree++;
+
+   if (slab->head.next == &slab->head)
+      LIST_ADDTAIL(&slab->head, &mgr->slabs);
+
+   /* If the slab becomes totally empty, free it */
+   if (slab->numFree == slab->numBuffers) {
+      list = &slab->head;
+      LIST_DELINIT(list);
+      pb_reference(&slab->bo, NULL);
+      FREE(slab->buffers);
+      FREE(slab);
+   }
+
+   pipe_mutex_unlock(mgr->mutex);
+}
+
+
+static void *
+pb_slab_buffer_map(struct pb_buffer *_buf,
+                   unsigned flags,
+                   void *flush_ctx)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+
+   /* XXX: it will be necessary to remap here to propagate flush_ctx */
+
+   ++buf->mapCount;
+   return (void *) ((uint8_t *) buf->slab->virtual + buf->start);
+}
+
+
+static void
+pb_slab_buffer_unmap(struct pb_buffer *_buf)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+
+   --buf->mapCount;
+   if (buf->mapCount == 0)
+      pipe_condvar_broadcast(buf->event);
+}
+
+
+static enum pipe_error
+pb_slab_buffer_validate(struct pb_buffer *_buf,
+                        struct pb_validate *vl,
+                        unsigned flags)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+   return pb_validate(buf->slab->bo, vl, flags);
+}
+
+
+static void
+pb_slab_buffer_fence(struct pb_buffer *_buf,
+                     struct pipe_fence_handle *fence)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+   pb_fence(buf->slab->bo, fence);
+}
+
+
+static void
+pb_slab_buffer_get_base_buffer(struct pb_buffer *_buf,
+                               struct pb_buffer **base_buf,
+                               pb_size *offset)
+{
+   struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
+   pb_get_base_buffer(buf->slab->bo, base_buf, offset);
+   *offset += buf->start;
+}
+
+
+static const struct pb_vtbl
+pb_slab_buffer_vtbl = {
+   pb_slab_buffer_destroy,
+   pb_slab_buffer_map,
+   pb_slab_buffer_unmap,
+   pb_slab_buffer_validate,
+   pb_slab_buffer_fence,
+   pb_slab_buffer_get_base_buffer
+};
+
+
+/**
+ * Create a new slab.
+ *
+ * Called when we run out of free slabs.
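+ *
+ * The slab's virtual address is recorded once here and sub-allocated
+ * buffers simply return offsets into it (see pb_slab_buffer_map above),
+ * which is why the underlying buffer must remain pinned.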
+ */
+static enum pipe_error
+pb_slab_create(struct pb_slab_manager *mgr)
+{
+   struct pb_slab *slab;
+   struct pb_slab_buffer *buf;
+   unsigned numBuffers;
+   unsigned i;
+   enum pipe_error ret;
+
+   slab = CALLOC_STRUCT(pb_slab);
+   if (!slab)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   slab->bo = mgr->provider->create_buffer(mgr->provider, mgr->slabSize, &mgr->desc);
+   if(!slab->bo) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto out_err0;
+   }
+
+   /* Note down the slab virtual address. All mappings are accessed directly
+    * through this address so it is required that the buffer is pinned. */
+   slab->virtual = pb_map(slab->bo,
+                          PB_USAGE_CPU_READ |
+                          PB_USAGE_CPU_WRITE, NULL);
+   if(!slab->virtual) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto out_err1;
+   }
+   pb_unmap(slab->bo);
+
+   numBuffers = slab->bo->size / mgr->bufSize;
+
+   slab->buffers = CALLOC(numBuffers, sizeof(*slab->buffers));
+   if (!slab->buffers) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto out_err1;
+   }
+
+   LIST_INITHEAD(&slab->head);
+   LIST_INITHEAD(&slab->freeBuffers);
+   slab->numBuffers = numBuffers;
+   slab->numFree = 0;
+   slab->mgr = mgr;
+
+   buf = slab->buffers;
+   for (i = 0; i < numBuffers; ++i) {
+      pipe_reference_init(&buf->base.reference, 0);
+      buf->base.size = mgr->bufSize;
+      buf->base.alignment = 0;
+      buf->base.usage = 0;
+      buf->base.vtbl = &pb_slab_buffer_vtbl;
+      buf->slab = slab;
+      buf->start = i * mgr->bufSize;
+      buf->mapCount = 0;
+      pipe_condvar_init(buf->event);
+      LIST_ADDTAIL(&buf->head, &slab->freeBuffers);
+      slab->numFree++;
+      buf++;
+   }
+
+   /* Add this slab to the list of partial slabs */
+   LIST_ADDTAIL(&slab->head, &mgr->slabs);
+
+   return PIPE_OK;
+
+out_err1:
+   pb_reference(&slab->bo, NULL);
+out_err0:
+   FREE(slab);
+   return ret;
+}
+
+
+static struct pb_buffer *
+pb_slab_manager_create_buffer(struct pb_manager *_mgr,
+                              pb_size size,
+                              const struct pb_desc *desc)
+{
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
+   struct pb_slab_buffer *buf;
+   struct pb_slab *slab;
+   struct list_head *list;
+
+   /* check size */
+   assert(size <= mgr->bufSize);
+   if(size > mgr->bufSize)
+      return NULL;
+
+   /* check if we can provide the requested alignment */
+   assert(pb_check_alignment(desc->alignment, mgr->desc.alignment));
+   if(!pb_check_alignment(desc->alignment, mgr->desc.alignment))
+      return NULL;
+   assert(pb_check_alignment(desc->alignment, mgr->bufSize));
+   if(!pb_check_alignment(desc->alignment, mgr->bufSize))
+      return NULL;
+
+   assert(pb_check_usage(desc->usage, mgr->desc.usage));
+   if(!pb_check_usage(desc->usage, mgr->desc.usage))
+      return NULL;
+
+   pipe_mutex_lock(mgr->mutex);
+
+   /* Create a new slab, if we run out of partial slabs */
+   if (mgr->slabs.next == &mgr->slabs) {
+      (void) pb_slab_create(mgr);
+      if (mgr->slabs.next == &mgr->slabs) {
+         pipe_mutex_unlock(mgr->mutex);
+         return NULL;
+      }
+   }
+
+   /* Allocate the buffer from a partial (or just created) slab */
+   list = mgr->slabs.next;
+   slab = LIST_ENTRY(struct pb_slab, list, head);
+
+   /* If totally full remove from the partial slab list */
+   if (--slab->numFree == 0)
+      LIST_DELINIT(list);
+
+   list = slab->freeBuffers.next;
+   LIST_DELINIT(list);
+
+   pipe_mutex_unlock(mgr->mutex);
+   buf = LIST_ENTRY(struct pb_slab_buffer, list, head);
+
+   pipe_reference_init(&buf->base.reference, 1);
+   buf->base.alignment = desc->alignment;
+   buf->base.usage = desc->usage;
+
+   return &buf->base;
+}
+
+
+static void
+pb_slab_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
+
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
mgr->provider->flush(mgr->provider); +} + + +static void +pb_slab_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_slab_manager *mgr = pb_slab_manager(_mgr); + + /* TODO: cleanup all allocated buffers */ + FREE(mgr); +} + + +struct pb_manager * +pb_slab_manager_create(struct pb_manager *provider, + pb_size bufSize, + pb_size slabSize, + const struct pb_desc *desc) +{ + struct pb_slab_manager *mgr; + + mgr = CALLOC_STRUCT(pb_slab_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_slab_manager_destroy; + mgr->base.create_buffer = pb_slab_manager_create_buffer; + mgr->base.flush = pb_slab_manager_flush; + + mgr->provider = provider; + mgr->bufSize = bufSize; + mgr->slabSize = slabSize; + mgr->desc = *desc; + + LIST_INITHEAD(&mgr->slabs); + + pipe_mutex_init(mgr->mutex); + + return &mgr->base; +} + + +static struct pb_buffer * +pb_slab_range_manager_create_buffer(struct pb_manager *_mgr, + pb_size size, + const struct pb_desc *desc) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + pb_size bufSize; + pb_size reqSize = size; + unsigned i; + + if(desc->alignment > reqSize) + reqSize = desc->alignment; + + bufSize = mgr->minBufSize; + for (i = 0; i < mgr->numBuckets; ++i) { + if(bufSize >= reqSize) + return mgr->buckets[i]->create_buffer(mgr->buckets[i], size, desc); + bufSize *= 2; + } + + /* Fall back to allocate a buffer object directly from the provider. */ + return mgr->provider->create_buffer(mgr->provider, size, desc); +} + + +static void +pb_slab_range_manager_flush(struct pb_manager *_mgr) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + + /* Individual slabs don't hold any temporary buffers so no need to call them */ + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_slab_range_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + unsigned i; + + for (i = 0; i < mgr->numBuckets; ++i) + mgr->buckets[i]->destroy(mgr->buckets[i]); + FREE(mgr->buckets); + FREE(mgr->bucketSizes); + FREE(mgr); +} + + +struct pb_manager * +pb_slab_range_manager_create(struct pb_manager *provider, + pb_size minBufSize, + pb_size maxBufSize, + pb_size slabSize, + const struct pb_desc *desc) +{ + struct pb_slab_range_manager *mgr; + pb_size bufSize; + unsigned i; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_slab_range_manager); + if (!mgr) + goto out_err0; + + mgr->base.destroy = pb_slab_range_manager_destroy; + mgr->base.create_buffer = pb_slab_range_manager_create_buffer; + mgr->base.flush = pb_slab_range_manager_flush; + + mgr->provider = provider; + mgr->minBufSize = minBufSize; + mgr->maxBufSize = maxBufSize; + + mgr->numBuckets = 1; + bufSize = minBufSize; + while(bufSize < maxBufSize) { + bufSize *= 2; + ++mgr->numBuckets; + } + + mgr->buckets = CALLOC(mgr->numBuckets, sizeof(*mgr->buckets)); + if (!mgr->buckets) + goto out_err1; + + bufSize = minBufSize; + for (i = 0; i < mgr->numBuckets; ++i) { + mgr->buckets[i] = pb_slab_manager_create(provider, bufSize, slabSize, desc); + if(!mgr->buckets[i]) + goto out_err2; + bufSize *= 2; + } + + return &mgr->base; + +out_err2: + for (i = 0; i < mgr->numBuckets; ++i) + if(mgr->buckets[i]) + mgr->buckets[i]->destroy(mgr->buckets[i]); + FREE(mgr->buckets); +out_err1: + FREE(mgr); +out_err0: + return NULL; +} diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.c b/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.c new file mode 100644 
index 0000000000..b585422460 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.c @@ -0,0 +1,192 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Buffer validation. + * + * @author Jose Fonseca + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "util/u_debug.h" + +#include "pb_buffer.h" +#include "pb_validate.h" + + +#define PB_VALIDATE_INITIAL_SIZE 1 /* 512 */ + + +struct pb_validate_entry +{ + struct pb_buffer *buf; + unsigned flags; +}; + + +struct pb_validate +{ + struct pb_validate_entry *entries; + unsigned used; + unsigned size; +}; + + +enum pipe_error +pb_validate_add_buffer(struct pb_validate *vl, + struct pb_buffer *buf, + unsigned flags) +{ + assert(buf); + if(!buf) + return PIPE_ERROR; + + assert(flags & PB_USAGE_GPU_READ_WRITE); + assert(!(flags & ~PB_USAGE_GPU_READ_WRITE)); + flags &= PB_USAGE_GPU_READ_WRITE; + + /* We only need to store one reference for each buffer, so avoid storing + * consecutive references for the same buffer. It might not be the most + * common pattern, but it is easy to implement. 
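+    *
+    * E.g. two back-to-back calls for the same buffer, the first with a
+    * read flag and the second with a write flag, collapse into a single
+    * entry whose flags are the OR of both.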
+ */ + if(vl->used && vl->entries[vl->used - 1].buf == buf) { + vl->entries[vl->used - 1].flags |= flags; + return PIPE_OK; + } + + /* Grow the table */ + if(vl->used == vl->size) { + unsigned new_size; + struct pb_validate_entry *new_entries; + + new_size = vl->size * 2; + if(!new_size) + return PIPE_ERROR_OUT_OF_MEMORY; + + new_entries = (struct pb_validate_entry *)REALLOC(vl->entries, + vl->size*sizeof(struct pb_validate_entry), + new_size*sizeof(struct pb_validate_entry)); + if(!new_entries) + return PIPE_ERROR_OUT_OF_MEMORY; + + memset(new_entries + vl->size, 0, (new_size - vl->size)*sizeof(struct pb_validate_entry)); + + vl->size = new_size; + vl->entries = new_entries; + } + + assert(!vl->entries[vl->used].buf); + pb_reference(&vl->entries[vl->used].buf, buf); + vl->entries[vl->used].flags = flags; + ++vl->used; + + return PIPE_OK; +} + + +enum pipe_error +pb_validate_foreach(struct pb_validate *vl, + enum pipe_error (*callback)(struct pb_buffer *buf, void *data), + void *data) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) { + enum pipe_error ret; + ret = callback(vl->entries[i].buf, data); + if(ret != PIPE_OK) + return ret; + } + return PIPE_OK; +} + + +enum pipe_error +pb_validate_validate(struct pb_validate *vl) +{ + unsigned i; + + for(i = 0; i < vl->used; ++i) { + enum pipe_error ret; + ret = pb_validate(vl->entries[i].buf, vl, vl->entries[i].flags); + if(ret != PIPE_OK) { + while(i--) + pb_validate(vl->entries[i].buf, NULL, 0); + return ret; + } + } + + return PIPE_OK; +} + + +void +pb_validate_fence(struct pb_validate *vl, + struct pipe_fence_handle *fence) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) { + pb_fence(vl->entries[i].buf, fence); + pb_reference(&vl->entries[i].buf, NULL); + } + vl->used = 0; +} + + +void +pb_validate_destroy(struct pb_validate *vl) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) + pb_reference(&vl->entries[i].buf, NULL); + FREE(vl->entries); + FREE(vl); +} + + +struct pb_validate * +pb_validate_create() +{ + struct pb_validate *vl; + + vl = CALLOC_STRUCT(pb_validate); + if(!vl) + return NULL; + + vl->size = PB_VALIDATE_INITIAL_SIZE; + vl->entries = (struct pb_validate_entry *)CALLOC(vl->size, sizeof(struct pb_validate_entry)); + if(!vl->entries) { + FREE(vl); + return NULL; + } + + return vl; +} + diff --git a/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.h b/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.h new file mode 100644 index 0000000000..3c93f30f20 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/pipebuffer/pb_validate.h @@ -0,0 +1,97 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Buffer validation. + * + * @author Jose Fonseca + */ + +#ifndef PB_VALIDATE_H_ +#define PB_VALIDATE_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_defines.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pb_buffer; +struct pipe_fence_handle; + + +/** + * Buffer validation list. + * + * It holds a list of buffers to be validated and fenced when flushing. + */ +struct pb_validate; + + +enum pipe_error +pb_validate_add_buffer(struct pb_validate *vl, + struct pb_buffer *buf, + unsigned flags); + +enum pipe_error +pb_validate_foreach(struct pb_validate *vl, + enum pipe_error (*callback)(struct pb_buffer *buf, void *data), + void *data); + +/** + * Validate all buffers for hardware access. + * + * Should be called right before issuing commands to the hardware. + */ +enum pipe_error +pb_validate_validate(struct pb_validate *vl); + +/** + * Fence all buffers and clear the list. + * + * Should be called right after issuing commands to the hardware. + */ +void +pb_validate_fence(struct pb_validate *vl, + struct pipe_fence_handle *fence); + +struct pb_validate * +pb_validate_create(void); + +void +pb_validate_destroy(struct pb_validate *vl); + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_VALIDATE_H_*/ diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.c b/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.c new file mode 100644 index 0000000000..7afcf1452b --- /dev/null +++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.c @@ -0,0 +1,67 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_config.h" +#include "rtasm_cpu.h" + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + +#include "util/u_debug.h" +#include "util/u_cpu_detect.h" + +DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", FALSE); + +static struct util_cpu_caps *get_cpu_caps(void) +{ + util_cpu_detect(); + return &util_cpu_caps; +} + +int rtasm_cpu_has_sse(void) +{ + return !debug_get_option_nosse() && get_cpu_caps()->has_sse; +} + +int rtasm_cpu_has_sse2(void) +{ + return !debug_get_option_nosse() && get_cpu_caps()->has_sse2; +} + + +#else + +int rtasm_cpu_has_sse(void) +{ + return 0; +} + +int rtasm_cpu_has_sse2(void) +{ + return 0; +} + +#endif diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.h b/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.h new file mode 100644 index 0000000000..ebc71634fd --- /dev/null +++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_cpu.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Runtime detection of CPU capabilities. + */ + +#ifndef _RTASM_CPU_H_ +#define _RTASM_CPU_H_ + + +int rtasm_cpu_has_sse(void); + +int rtasm_cpu_has_sse2(void); + + +#endif /* _RTASM_CPU_H_ */ diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.c b/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.c new file mode 100644 index 0000000000..50de1a992f --- /dev/null +++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * \file rtasm_execmem.c
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "os/os_thread.h"
+#include "util/u_memory.h"
+
+#include "rtasm_execmem.h"
+
+#include "util/u_mm.h"
+
+#define EXEC_HEAP_SIZE (4*1024*1024)
+
+pipe_static_mutex(exec_mutex);
+
+static struct mem_block *exec_heap = NULL;
+static unsigned char *exec_mem = NULL;
+
+
+static void
+init_heap(void)
+{
+   if (!exec_heap)
+      exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
+
+   if (!exec_mem)
+      exec_mem = (unsigned char *) user_alloc(EXEC_HEAP_SIZE);
+}
+
+
+void *
+rtasm_exec_malloc(size_t size)
+{
+   struct mem_block *block = NULL;
+   void *addr = NULL;
+
+   pipe_mutex_lock(exec_mutex);
+
+   init_heap();
+
+   if (exec_heap) {
+      size = (size + 31) & ~31;  /* next multiple of 32 bytes */
+      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
+   }
+
+   if (block)
+      addr = exec_mem + block->ofs;
+   else
+      debug_printf("rtasm_exec_malloc failed\n");
+
+   pipe_mutex_unlock(exec_mutex);
+
+   return addr;
+}
+
+
+void
+rtasm_exec_free(void *addr)
+{
+   pipe_mutex_lock(exec_mutex);
+
+   if (exec_heap) {
+      struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+
+      if (block)
+         u_mmFreeMem(block);
+   }
+
+   pipe_mutex_unlock(exec_mutex);
+}
+
diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.h b/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.h
new file mode 100644
index 0000000000..5028e63cca
--- /dev/null
+++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_execmem.h
@@ -0,0 +1,46 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file rtasm_execmem.h
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+#ifndef _RTASM_EXECMEM_H_
+#define _RTASM_EXECMEM_H_
+
+#include "pipe/p_compiler.h"
+
+
+extern void *
+rtasm_exec_malloc( size_t size );
+
+
+extern void
+rtasm_exec_free( void *addr );
+
+
+#endif
diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.c b/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.c
new file mode 100644
index 0000000000..24ff820a4e
--- /dev/null
+++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -0,0 +1,2232 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ * + **************************************************************************/ + +#include "pipe/p_config.h" +#include "util/u_cpu_detect.h" + +#if defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__)) + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "util/u_pointer.h" + +#include "rtasm_execmem.h" +#include "rtasm_x86sse.h" + +#define DISASSEM 0 +#define X86_TWOB 0x0f + + +#define DUMP_SSE 0 + + +void x86_print_reg( struct x86_reg reg ) +{ + if (reg.mod != mod_REG) + debug_printf( "[" ); + + switch( reg.file ) { + case file_REG32: + switch( reg.idx ) { + case reg_AX: debug_printf( "EAX" ); break; + case reg_CX: debug_printf( "ECX" ); break; + case reg_DX: debug_printf( "EDX" ); break; + case reg_BX: debug_printf( "EBX" ); break; + case reg_SP: debug_printf( "ESP" ); break; + case reg_BP: debug_printf( "EBP" ); break; + case reg_SI: debug_printf( "ESI" ); break; + case reg_DI: debug_printf( "EDI" ); break; + } + break; + case file_MMX: + debug_printf( "MMX%u", reg.idx ); + break; + case file_XMM: + debug_printf( "XMM%u", reg.idx ); + break; + case file_x87: + debug_printf( "fp%u", reg.idx ); + break; + } + + if (reg.mod == mod_DISP8 || + reg.mod == mod_DISP32) + debug_printf("+%d", reg.disp); + + if (reg.mod != mod_REG) + debug_printf( "]" ); +} + +#if DUMP_SSE + +#define DUMP_START() debug_printf( "\n" ) +#define DUMP_END() debug_printf( "\n" ) + +#define DUMP() do { \ + const char *foo = __FUNCTION__; \ + while (*foo && *foo != '_') \ + foo++; \ + if (*foo) \ + foo++; \ + debug_printf( "\n%4x %14s ", p->csr - p->store, foo ); \ +} while (0) + +#define DUMP_I( I ) do { \ + DUMP(); \ + debug_printf( "%u", I ); \ +} while( 0 ) + +#define DUMP_R( R0 ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ +} while( 0 ) + +#define DUMP_RR( R0, R1 ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", " ); \ + x86_print_reg( R1 ); \ +} while( 0 ) + +#define DUMP_RI( R0, I ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", %u", I ); \ +} while( 0 ) + +#define DUMP_RRI( R0, R1, I ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", " ); \ + x86_print_reg( R1 ); \ + debug_printf( ", %u", I ); \ +} while( 0 ) + +#else + +#define DUMP_START() +#define DUMP_END() +#define DUMP( ) +#define DUMP_I( I ) +#define DUMP_R( R0 ) +#define DUMP_RR( R0, R1 ) +#define DUMP_RI( R0, I ) +#define DUMP_RRI( R0, R1, I ) + +#endif + + +static void do_realloc( struct x86_function *p ) +{ + if (p->store == p->error_overflow) { + p->csr = p->store; + } + else if (p->size == 0) { + p->size = 1024; + p->store = rtasm_exec_malloc(p->size); + p->csr = p->store; + } + else { + uintptr_t used = pointer_to_uintptr( p->csr ) - pointer_to_uintptr( p->store ); + unsigned char *tmp = p->store; + p->size *= 2; + p->store = rtasm_exec_malloc(p->size); + + if (p->store) { + memcpy(p->store, tmp, used); + p->csr = p->store + used; + } + else { + p->csr = p->store; + } + + rtasm_exec_free(tmp); + } + + if (p->store == NULL) { + p->store = p->csr = p->error_overflow; + p->size = sizeof(p->error_overflow); + } +} + +/* Emit bytes to the instruction stream: + */ +static unsigned char *reserve( struct x86_function *p, int bytes ) +{ + if (p->csr + bytes - p->store > (int) p->size) + do_realloc(p); + + { + unsigned char *csr = p->csr; + p->csr += bytes; + return csr; + } +} + + + +static void emit_1b( struct x86_function *p, char b0 ) +{ + char *csr = (char *)reserve(p, 1); + *csr = b0; +} + +static void emit_1i( struct x86_function *p, int i0 ) +{ + int *icsr = (int 
*)reserve(p, sizeof(i0)); + *icsr = i0; +} + +static void emit_1ub( struct x86_function *p, unsigned char b0 ) +{ + unsigned char *csr = reserve(p, 1); + *csr++ = b0; +} + +static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 ) +{ + unsigned char *csr = reserve(p, 2); + *csr++ = b0; + *csr++ = b1; +} + +static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 ) +{ + unsigned char *csr = reserve(p, 3); + *csr++ = b0; + *csr++ = b1; + *csr++ = b2; +} + + +/* Build a modRM byte + possible displacement. No treatment of SIB + * indexing. BZZT - no way to encode an absolute address. + * + * This is the "/r" field in the x86 manuals... + */ +static void emit_modrm( struct x86_function *p, + struct x86_reg reg, + struct x86_reg regmem ) +{ + unsigned char val = 0; + + assert(reg.mod == mod_REG); + + /* TODO: support extended x86-64 registers */ + assert(reg.idx < 8); + assert(regmem.idx < 8); + + val |= regmem.mod << 6; /* mod field */ + val |= reg.idx << 3; /* reg field */ + val |= regmem.idx; /* r/m field */ + + emit_1ub(p, val); + + /* Oh-oh we've stumbled into the SIB thing. + */ + if (regmem.file == file_REG32 && + regmem.idx == reg_SP && + regmem.mod != mod_REG) { + emit_1ub(p, 0x24); /* simplistic! */ + } + + switch (regmem.mod) { + case mod_REG: + case mod_INDIRECT: + break; + case mod_DISP8: + emit_1b(p, (char) regmem.disp); + break; + case mod_DISP32: + emit_1i(p, regmem.disp); + break; + default: + assert(0); + break; + } +} + +/* Emits the "/0".."/7" specialized versions of the modrm ("/r") bytes. + */ +static void emit_modrm_noreg( struct x86_function *p, + unsigned op, + struct x86_reg regmem ) +{ + struct x86_reg dummy = x86_make_reg(file_REG32, op); + emit_modrm(p, dummy, regmem); +} + +/* Many x86 instructions have two opcodes to cope with the situations + * where the destination is a register or memory reference + * respectively. This function selects the correct opcode based on + * the arguments presented. 
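+ *
+ * E.g. x86_mov() below passes 0x8b (reg <- r/m) and 0x89 (r/m <- reg):
+ * a register destination picks 0x8b, while a memory destination picks
+ * 0x89 and swaps the operands in the modrm byte.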
+ */ +static void emit_op_modrm( struct x86_function *p, + unsigned char op_dst_is_reg, + unsigned char op_dst_is_mem, + struct x86_reg dst, + struct x86_reg src ) +{ + switch (dst.mod) { + case mod_REG: + emit_1ub(p, op_dst_is_reg); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_1ub(p, op_dst_is_mem); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + + + + + + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ) +{ + struct x86_reg reg; + + reg.file = file; + reg.idx = idx; + reg.mod = mod_REG; + reg.disp = 0; + + return reg; +} + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ) +{ + assert(reg.file == file_REG32); + + if (reg.mod == mod_REG) + reg.disp = disp; + else + reg.disp += disp; + + if (reg.disp == 0 && reg.idx != reg_BP) + reg.mod = mod_INDIRECT; + else if (reg.disp <= 127 && reg.disp >= -128) + reg.mod = mod_DISP8; + else + reg.mod = mod_DISP32; + + return reg; +} + +struct x86_reg x86_deref( struct x86_reg reg ) +{ + return x86_make_disp(reg, 0); +} + +struct x86_reg x86_get_base_reg( struct x86_reg reg ) +{ + return x86_make_reg( reg.file, reg.idx ); +} + +int x86_get_label( struct x86_function *p ) +{ + return p->csr - p->store; +} + + + +/*********************************************************************** + * x86 instructions + */ + + +void x64_rexw(struct x86_function *p) +{ + if(x86_target(p) != X86_32) + emit_1ub(p, 0x48); +} + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + int label ) +{ + int offset = label - (x86_get_label(p) + 2); + DUMP_I(cc); + + if (offset < 0) { + /*assert(p->csr - p->store > -offset);*/ + if (p->csr - p->store <= -offset) { + /* probably out of memory (using the error_overflow buffer) */ + return; + } + } + + if (offset <= 127 && offset >= -128) { + emit_1ub(p, 0x70 + cc); + emit_1b(p, (char) offset); + } + else { + offset = label - (x86_get_label(p) + 6); + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, offset); + } +} + +/* Always use a 32bit offset for forward jumps: + */ +int x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ) +{ + DUMP_I(cc); + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, 0); + return x86_get_label(p); +} + +int x86_jmp_forward( struct x86_function *p) +{ + DUMP(); + emit_1ub(p, 0xe9); + emit_1i(p, 0); + return x86_get_label(p); +} + +int x86_call_forward( struct x86_function *p) +{ + DUMP(); + + emit_1ub(p, 0xe8); + emit_1i(p, 0); + return x86_get_label(p); +} + +/* Fixup offset from forward jump: + */ +void x86_fixup_fwd_jump( struct x86_function *p, + int fixup ) +{ + *(int *)(p->store + fixup - 4) = x86_get_label(p) - fixup; +} + +void x86_jmp( struct x86_function *p, int label) +{ + DUMP_I( label ); + emit_1ub(p, 0xe9); + emit_1i(p, label - x86_get_label(p) - 4); +} + +void x86_call( struct x86_function *p, struct x86_reg reg) +{ + DUMP_R( reg ); + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 2, reg); +} + + +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + assert(dst.file == file_REG32); + assert(dst.mod == mod_REG); + emit_1ub(p, 0xb8 + dst.idx); + emit_1i(p, imm); +} + +void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + x86_mov_reg_imm(p, dst, imm); + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_1i(p, imm); + } +} + +void x86_mov16_imm( 
struct x86_function *p, struct x86_reg dst, uint16_t imm ) +{ + DUMP_RI( dst, imm ); + emit_1ub(p, 0x66); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb8 + dst.idx); + emit_2ub(p, imm & 0xff, imm >> 8); + } + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_2ub(p, imm & 0xff, imm >> 8); + } +} + +void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb0 + dst.idx); + emit_1ub(p, imm); + } + else + { + emit_1ub(p, 0xc6); + emit_modrm_noreg(p, 0, dst); + emit_1ub(p, imm); + } +} + +/** + * Immediate group 1 instructions. + */ +static INLINE void +x86_group1_imm( struct x86_function *p, + unsigned op, struct x86_reg dst, int imm ) +{ + assert(dst.file == file_REG32); + assert(dst.mod == mod_REG); + if(-0x80 <= imm && imm < 0x80) { + emit_1ub(p, 0x83); + emit_modrm_noreg(p, op, dst); + emit_1b(p, (char)imm); + } + else { + emit_1ub(p, 0x81); + emit_modrm_noreg(p, op, dst); + emit_1i(p, imm); + } +} + +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 0, dst, imm); +} + +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 1, dst, imm); +} + +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 4, dst, imm); +} + +void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 5, dst, imm); +} + +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 6, dst, imm); +} + +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 7, dst, imm); +} + + +void x86_push( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + if (reg.mod == mod_REG) + emit_1ub(p, 0x50 + reg.idx); + else + { + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 6, reg); + } + + + p->stack_offset += sizeof(void*); +} + +void x86_push_imm32( struct x86_function *p, + int imm32 ) +{ + DUMP_I( imm32 ); + emit_1ub(p, 0x68); + emit_1i(p, imm32); + + p->stack_offset += sizeof(void*); +} + + +void x86_pop( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + assert(reg.mod == mod_REG); + emit_1ub(p, 0x58 + reg.idx); + p->stack_offset -= sizeof(void*); +} + +void x86_inc( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x40 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 0, reg); +} + +void x86_dec( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x48 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 1, reg); +} + +void x86_ret( struct x86_function *p ) +{ + DUMP(); + assert(p->stack_offset == 0); + emit_1ub(p, 0xc3); +} + +void x86_retw( struct x86_function *p, unsigned short imm ) +{ + DUMP(); + emit_3ub(p, 0xc2, imm & 0xff, (imm >> 8) & 0xff); +} + +void x86_sahf( struct x86_function *p ) +{ + DUMP(); + emit_1ub(p, 0x9e); +} + +void x86_mov( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + uint8_t rex 
= 0x40; + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + emit_1ub(p, rex); + } + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov16( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x66); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov8( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8a, 0x88, dst, src ); +} + +void x64_mov64( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + uint8_t rex = 0x48; + DUMP_RR( dst, src ); + assert(x86_target(p) != X86_32); + + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + } + emit_1ub(p, rex); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb6); + emit_modrm(p, dst, src); +} + +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb7); + emit_modrm(p, dst, src); +} + +void x86_cmovcc( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + enum x86_cc cc) +{ + DUMP_RRI( dst, src, cc ); + emit_2ub( p, 0x0f, 0x40 + cc ); + emit_modrm( p, dst, src ); +} + +void x86_xor( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x33, 0x31, dst, src ); +} + +void x86_cmp( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x3b, 0x39, dst, src ); +} + +void x86_lea( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x8d); + emit_modrm( p, dst, src ); +} + +void x86_test( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x85); + emit_modrm( p, dst, src ); +} + +void x86_add( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm(p, 0x03, 0x01, dst, src ); +} + +/* Calculate EAX * src, results in EDX:EAX. 
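+ * (This is the one-operand unsigned MUL, opcode 0xf7 with /4: the 64-bit
+ * product's low half is left in EAX and the high half in EDX.)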
+ */ +void x86_mul( struct x86_function *p, + struct x86_reg src ) +{ + DUMP_R( src ); + emit_1ub(p, 0xf7); + emit_modrm_noreg(p, 4, src ); +} + + +void x86_imul( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0xAF); + emit_modrm(p, dst, src); +} + + +void x86_sub( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm(p, 0x2b, 0x29, dst, src ); +} + +void x86_or( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x0b, 0x09, dst, src ); +} + +void x86_and( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x23, 0x21, dst, src ); +} + +void x86_div( struct x86_function *p, + struct x86_reg src ) +{ + assert(src.file == file_REG32 && src.mod == mod_REG); + emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src); +} + +void x86_bswap( struct x86_function *p, struct x86_reg reg ) +{ + DUMP_R(reg); + assert(reg.file == file_REG32); + assert(reg.mod == mod_REG); + emit_2ub(p, 0x0f, 0xc8 + reg.idx); +} + +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 5, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 5, reg); + emit_1ub(p, imm); + } +} + +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 7, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 7, reg); + emit_1ub(p, imm); + } +} + +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 4, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 4, reg); + emit_1ub(p, imm); + } +} + + +/*********************************************************************** + * SSE instructions + */ + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 0, ptr); +} + +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 1, ptr); +} + +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 2, ptr); +} + +void sse_movntps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + + assert(dst.mod != mod_REG); + assert(src.mod == mod_REG); + emit_2ub(p, 0x0f, 0x2b); + emit_modrm(p, src, dst); +} + + + + +void sse_movss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0xF3, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movaps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x28, 0x29, dst, src ); +} + +void sse_movups( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + 
assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */ +} + +void sse_movlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */ +} + +void sse_maxps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_maxss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_divss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x5E); + emit_modrm( p, dst, src ); +} + +void sse_minps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5D); + emit_modrm( p, dst, src ); +} + +void sse_subps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5C); + emit_modrm( p, dst, src ); +} + +void sse_mulps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_mulss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_addps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_addss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_andnps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x55); + emit_modrm( p, dst, src ); +} + +void sse_andps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x54); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); + +} + +void sse_movhlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x12); + emit_modrm( p, dst, src ); +} + +void sse_movlhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x16); + emit_modrm( p, dst, src ); +} + +void sse_orps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x56); + emit_modrm( p, dst, src ); +} + +void sse_xorps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x57); + emit_modrm( p, dst, src ); +} + 
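+/*
+ * A minimal usage sketch for the emitters above (assuming the
+ * x86_init_func()/x86_get_func()/x86_release_func() helpers declared in
+ * rtasm_x86sse.h): build and call a function that zeroes xmm0 and returns.
+ *
+ *    struct x86_function f;
+ *    void (*fn)(void);
+ *
+ *    x86_init_func(&f);
+ *    sse_xorps(&f, x86_make_reg(file_XMM, 0), x86_make_reg(file_XMM, 0));
+ *    x86_ret(&f);
+ *    fn = (void (*)(void)) x86_get_func(&f);
+ *    fn();
+ *    x86_release_func(&f);
+ */
+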
+void sse_cvtps2pi( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.file == file_MMX && + (src.file == file_XMM || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x2d); + emit_modrm( p, dst, src ); +} + +void sse2_cvtdq2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5b); + emit_modrm( p, dst, src ); +} + + +/* Shufps can also be used to implement a reduced swizzle when dest == + * arg0. + */ +void sse_shufps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_2ub(p, X86_TWOB, 0xC6); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub( p, X86_TWOB, 0x15 ); + emit_modrm( p, dst, src ); +} + +void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub( p, X86_TWOB, 0x14 ); + emit_modrm( p, dst, src ); +} + +void sse_cmpps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + enum sse_cc cc) +{ + DUMP_RRI( dst, src, cc ); + emit_2ub(p, X86_TWOB, 0xC2); + emit_modrm(p, dst, src); + emit_1ub(p, cc); +} + +void sse_pmovmskb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0xD7); + emit_modrm(p, dst, src); +} + +void sse_movmskps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x50); + emit_modrm(p, dst, src); +} + +/*********************************************************************** + * SSE2 instructions + */ + +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + if(dst.mod == mod_REG && dst.file == file_REG32) + { + emit_1ub(p, 0x7e); + emit_modrm(p, src, dst); + } + else + { + emit_op_modrm(p, 0x6e, 0x7e, dst, src); + } +} + +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + switch (dst.mod) { + case mod_REG: + emit_3ub(p, 0xf3, 0x0f, 0x7e); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_3ub(p, 0x66, 0x0f, 0xd6); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf3, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf2, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x28, 0x29, dst, src); +} + +/** + * Perform a reduced swizzle: + */ +void sse2_pshufd( struct x86_function *p, + struct x86_reg dst, + struct 
x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0x66, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshuflw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf2, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshufhw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf3, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_cvttps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub( p, 0xF3, X86_TWOB, 0x5B ); + emit_modrm( p, dst, src ); +} + +void sse2_cvtps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x5B); + emit_modrm( p, dst, src ); +} + +void sse2_cvtsd2ss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xf2, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_cvtpd2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_packssdw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x6B); + emit_modrm( p, dst, src ); +} + +void sse2_packsswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x63); + emit_modrm( p, dst, src ); +} + +void sse2_packuswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x67); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklbw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x60); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x61); + emit_modrm( p, dst, src ); +} + +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x62); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x6c); + emit_modrm( p, dst, src ); +} + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 
0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_3ub(p, 0x66, 0x0f, 0xeb); + emit_modrm(p, dst, src); +} + +void sse2_rcpps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +void sse2_rcpss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +/*********************************************************************** + * x87 instructions + */ +static void note_x87_pop( struct x86_function *p ) +{ + p->x87_stack--; + assert(p->x87_stack >= 0); +} + +static void note_x87_push( struct x86_function *p ) +{ + p->x87_stack++; + assert(p->x87_stack <= 7); +} + +void x87_assert_stack_empty( struct x86_function *p ) +{ + assert (p->x87_stack == 0); +} + + +void x87_fist( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + emit_1ub(p, 0xdb); + emit_modrm_noreg(p, 2, dst); +} + +void x87_fistp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + emit_1ub(p, 0xdb); + emit_modrm_noreg(p, 3, dst); + note_x87_pop(p); +} + +void x87_fild( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + emit_1ub(p, 0xdf); + emit_modrm_noreg(p, 0, arg); + note_x87_push(p); +} + +void x87_fldz( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xee); + note_x87_push(p); +} + + +void x87_fldcw( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_REG32); + assert(arg.mod != mod_REG); + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 5, arg); +} + +void x87_fld1( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xe8); + note_x87_push(p); +} + +void x87_fldl2e( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xea); + note_x87_push(p); +} + +void x87_fldln2( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xed); + note_x87_push(p); +} + +void x87_fwait( struct x86_function *p ) +{ + DUMP(); + emit_1ub(p, 0x9b); +} + +void x87_fnclex( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xdb, 0xe2); +} + +void x87_fclex( struct x86_function *p ) +{ + x87_fwait(p); + x87_fnclex(p); +} + +void x87_fcmovb( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xc0+arg.idx); +} + +void x87_fcmove( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xc8+arg.idx); +} + +void x87_fcmovbe( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xd0+arg.idx); +} + +void x87_fcmovnb( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdb, 0xc0+arg.idx); 
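+   /* Note that FCMOVcc, like everything else in this file, is emitted
+    * without any runtime capability check; it needs a P6-class FPU
+    * (the CPUID CMOV flag together with an on-chip FPU).
+    */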
+}
+
+void x87_fcmovne( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdb, 0xc8+arg.idx);
+}
+
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdb, 0xd0+arg.idx);
+}
+
+
+
+/* Shared emitter for the two-operand x87 arithmetic instructions:
+ * dst0ub0/dst0ub1 encode the "st(0) op= st(i)" form, arg0ub0/arg0ub1
+ * the "st(i) op= st(0)" form, and argmem_noreg is the 0xd8 /r opcode
+ * extension used when arg is a 32-bit memory operand.
+ */
+static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
+                          unsigned char dst0ub0,
+                          unsigned char dst0ub1,
+                          unsigned char arg0ub0,
+                          unsigned char arg0ub1,
+                          unsigned char argmem_noreg)
+{
+   assert(dst.file == file_x87);
+
+   if (arg.file == file_x87) {
+      if (dst.idx == 0)
+         emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
+      else if (arg.idx == 0)
+         emit_2ub(p, arg0ub0, arg0ub1+dst.idx);
+      else
+         assert(0);
+   }
+   else if (dst.idx == 0) {
+      assert(arg.file == file_REG32);
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, argmem_noreg, arg);
+   }
+   else
+      assert(0);
+}
+
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xc8,
+                0xdc, 0xc8,
+                1);             /* FMUL m32fp is d8 /1 */
+}
+
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xe0,
+                0xdc, 0xe8,
+                4);
+}
+
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xe8,
+                0xdc, 0xe0,
+                5);
+}
+
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xc0,
+                0xdc, 0xc0,
+                0);
+}
+
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xf0,
+                0xdc, 0xf8,
+                6);
+}
+
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src,
+                0xd8, 0xf8,
+                0xdc, 0xf0,
+                7);
+}
+
+void x87_fmulp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xc8+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_fsubp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xe8+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xe0+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_faddp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xc0+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_fdivp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xf8+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xf0+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_ftst( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xe4);
+}
+
+void x87_fucom( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe0+arg.idx);
+}
+
+void x87_fucomp( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe8+arg.idx);
+   note_x87_pop(p);
+}
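+
+/* The fucom family differs only in stack effect: x87_fucom() leaves the
+ * stack alone, x87_fucomp() pops once and x87_fucompp() (below) pops
+ * twice; the note_x87_pop() calls keep p->x87_stack in sync.  The result
+ * lands in the FPU status word, so a typical sequence is, for instance:
+ *
+ *    x87_fucomp(p, x86_make_reg(file_x87, 1));
+ *    x87_fnstsw(p, x86_make_reg(file_REG32, reg_AX));
+ *    x86_sahf(p);
+ */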
+
+void x87_fucompp( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xda, 0xe9);
+   note_x87_pop(p);             /* pops twice */
+   note_x87_pop(p);
+}
+
+void x87_fxch( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xd9, 0xc8+arg.idx);
+}
+
+void x87_fabs( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xe1);
+}
+
+void x87_fchs( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xe0);
+}
+
+void x87_fcos( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xff);
+}
+
+
+void x87_fprndint( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xfc);
+}
+
+void x87_fscale( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xfd);
+}
+
+void x87_fsin( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xfe);
+}
+
+void x87_fsincos( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xfb);
+}
+
+void x87_fsqrt( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xfa);
+}
+
+void x87_fxtract( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xf4);
+}
+
+/* st0 = (2^st0)-1
+ *
+ * Restrictions: -1.0 <= st0 <= 1.0
+ */
+void x87_f2xm1( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xf0);
+}
+
+/* st1 = st1 * log2(st0);
+ * pop_stack;
+ */
+void x87_fyl2x( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xf1);
+   note_x87_pop(p);
+}
+
+/* st1 = st1 * log2(st0 + 1.0);
+ * pop_stack;
+ *
+ * A fast operation, with restrictions: -.29 < st0 < .29
+ */
+void x87_fyl2xp1( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xf9);
+   note_x87_pop(p);
+}
+
+
+void x87_fld( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   if (arg.file == file_x87)
+      emit_2ub(p, 0xd9, 0xc0 + arg.idx);
+   else {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 0, arg);
+   }
+   note_x87_push(p);
+}
+
+void x87_fst( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   if (dst.file == file_x87)
+      emit_2ub(p, 0xdd, 0xd0 + dst.idx);
+   else {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 2, dst);
+   }
+}
+
+void x87_fstp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   if (dst.file == file_x87)
+      emit_2ub(p, 0xdd, 0xd8 + dst.idx);
+   else {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 3, dst);
+   }
+   note_x87_pop(p);
+}
+
+void x87_fpop( struct x86_function *p )
+{
+   x87_fstp( p, x86_make_reg( file_x87, 0 ));
+}
+
+
+void x87_fcom( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   if (dst.file == file_x87)
+      emit_2ub(p, 0xd8, 0xd0 + dst.idx);
+   else {
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, 2, dst);
+   }
+}
+
+
+void x87_fcomp( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   if (dst.file == file_x87)
+      emit_2ub(p, 0xd8, 0xd8 + dst.idx);
+   else {
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, 3, dst);
+   }
+   note_x87_pop(p);
+}
+
+void x87_fcomi( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdb, 0xf0+arg.idx);
+}
+
+void x87_fcomip( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdf, 0xf0+arg.idx);     /* df, not db: FCOMIP is the popping form */
+   note_x87_pop(p);
+}
+
+
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_REG32);
+
+   if (dst.idx == reg_AX &&
+       dst.mod == mod_REG)
+      emit_2ub(p, 0xdf, 0xe0);
+   else {
+      emit_1ub(p, 0xdd);
+      emit_modrm_noreg(p, 7, dst);
+   }
+}
+
+
+void x87_fnstcw( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_REG32);
+
+   emit_1ub(p, 0x9b);           /* WAIT -- needed? */
+   emit_1ub(p, 0xd9);
+   emit_modrm_noreg(p, 7, dst);
+}
+
+
+
+
+/***********************************************************************
+ * MMX instructions
+ */
+
+void mmx_emms( struct x86_function *p )
+{
+   DUMP();
+   assert(p->need_emms);
+   emit_2ub(p, 0x0f, 0x77);
+   p->need_emms = 0;
+}
+
+void mmx_packssdw( struct x86_function *p,
+                   struct x86_reg dst,
+                   struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   assert(dst.file == file_MMX &&
+          (src.file == file_MMX || src.mod != mod_REG));
+
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x6b);
+   emit_modrm( p, dst, src );
+}
+
+void mmx_packuswb( struct x86_function *p,
+                   struct x86_reg dst,
+                   struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   assert(dst.file == file_MMX &&
+          (src.file == file_MMX || src.mod != mod_REG));
+
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x67);
+   emit_modrm( p, dst, src );
+}
+
+void mmx_movd( struct x86_function *p,
+               struct x86_reg dst,
+               struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   p->need_emms = 1;
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
+}
+
+void mmx_movq( struct x86_function *p,
+               struct x86_reg dst,
+               struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   p->need_emms = 1;
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm( p, 0x6f, 0x7f, dst, src );
+}
+
+
+/***********************************************************************
+ * Helper functions
+ */
+
+
+void x86_cdecl_caller_push_regs( struct x86_function *p )
+{
+   x86_push(p, x86_make_reg(file_REG32, reg_AX));
+   x86_push(p, x86_make_reg(file_REG32, reg_CX));
+   x86_push(p, x86_make_reg(file_REG32, reg_DX));
+}
+
+void x86_cdecl_caller_pop_regs( struct x86_function *p )
+{
+   x86_pop(p, x86_make_reg(file_REG32, reg_DX));
+   x86_pop(p, x86_make_reg(file_REG32, reg_CX));
+   x86_pop(p, x86_make_reg(file_REG32, reg_AX));
+}
+
+
+struct x86_reg x86_fn_arg( struct x86_function *p,
+                           unsigned arg )
+{
+   switch(x86_target(p))
+   {
+   case X86_64_WIN64_ABI:
+      /* Microsoft uses a different calling convention than the rest of the world */
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 2:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 3:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 4:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         /* Win64 allocates stack slots as if it pushed the first 4 arguments too */
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+                              p->stack_offset + arg * 8);
+      }
+   case X86_64_STD_ABI:
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_DI);
+      case 2:
+         return x86_make_reg(file_REG32, reg_SI);
+      case 3:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 4:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 5:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 6:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+                              p->stack_offset + (arg - 6) * 8);  /* args 7+ sit just above the 8-byte return address */
+      }
+   case X86_32:
+      return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+                           p->stack_offset + arg * 4);  /* args start just above the 4-byte return address 
*/ + default: + assert(0 && "Unexpected x86 target ABI in x86_fn_arg"); + return x86_make_reg(file_REG32, reg_CX); /* not used / silence warning */ + } +} + +static void x86_init_func_common( struct x86_function *p ) +{ + util_cpu_detect(); + p->caps = 0; + if(util_cpu_caps.has_mmx) + p->caps |= X86_MMX; + if(util_cpu_caps.has_mmx2) + p->caps |= X86_MMX2; + if(util_cpu_caps.has_sse) + p->caps |= X86_SSE; + if(util_cpu_caps.has_sse2) + p->caps |= X86_SSE2; + if(util_cpu_caps.has_sse3) + p->caps |= X86_SSE3; + if(util_cpu_caps.has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + DUMP_START(); +} + +void x86_init_func( struct x86_function *p ) +{ + p->size = 0; + p->store = NULL; + x86_init_func_common(p); +} + +void x86_init_func_size( struct x86_function *p, unsigned code_size ) +{ + p->size = code_size; + p->store = rtasm_exec_malloc(code_size); + if (p->store == NULL) { + p->store = p->error_overflow; + } + x86_init_func_common(p); +} + +void x86_release_func( struct x86_function *p ) +{ + if (p->store && p->store != p->error_overflow) + rtasm_exec_free(p->store); + + p->store = NULL; + p->csr = NULL; + p->size = 0; +} + + +static INLINE x86_func +voidptr_to_x86_func(void *v) +{ + union { + void *v; + x86_func f; + } u; + assert(sizeof(u.v) == sizeof(u.f)); + u.v = v; + return u.f; +} + + +x86_func x86_get_func( struct x86_function *p ) +{ + DUMP_END(); + if (DISASSEM && p->store) + debug_printf("disassemble %p %p\n", p->store, p->csr); + + if (p->store == p->error_overflow) + return voidptr_to_x86_func(NULL); + else + return voidptr_to_x86_func(p->store); +} + +#else + +void x86sse_dummy( void ); + +void x86sse_dummy( void ) +{ +} + +#endif diff --git a/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.h b/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.h new file mode 100644 index 0000000000..67c9bdd993 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -0,0 +1,416 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef _RTASM_X86SSE_H_ +#define _RTASM_X86SSE_H_ + +#include "pipe/p_compiler.h" +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + +/* It is up to the caller to ensure that instructions issued are + * suitable for the host cpu. 
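+ * Callers can consult x86_target_caps(), filled in from
+ * util_cpu_detect(), before emitting, as in this sketch:
+ *
+ *    if (x86_target_caps(p) & X86_SSE2)
+ *       sse2_pshufd(p, dst, src, SHUF(0, 0, 0, 0));
+ *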
There are no checks made in this module + * for mmx/sse/sse2 support on the cpu. + */ +struct x86_reg { + unsigned file:2; + unsigned idx:4; + unsigned mod:2; /* mod_REG if this is just a register */ + int disp:24; /* only +/- 23bits of offset - should be enough... */ +}; + +#define X86_MMX 1 +#define X86_MMX2 2 +#define X86_SSE 4 +#define X86_SSE2 8 +#define X86_SSE3 0x10 +#define X86_SSE4_1 0x20 + +struct x86_function { + unsigned caps; + unsigned size; + unsigned char *store; + unsigned char *csr; + + unsigned stack_offset:16; + unsigned need_emms:8; + int x87_stack:8; + + unsigned char error_overflow[4]; +}; + +enum x86_reg_file { + file_REG32, + file_MMX, + file_XMM, + file_x87 +}; + +/* Values for mod field of modr/m byte + */ +enum x86_reg_mod { + mod_INDIRECT, + mod_DISP8, + mod_DISP32, + mod_REG +}; + +enum x86_reg_name { + reg_AX, + reg_CX, + reg_DX, + reg_BX, + reg_SP, + reg_BP, + reg_SI, + reg_DI, + reg_R8, + reg_R9, + reg_R10, + reg_R11, + reg_R12, + reg_R13, + reg_R14, + reg_R15 +}; + + +enum x86_cc { + cc_O, /* overflow */ + cc_NO, /* not overflow */ + cc_NAE, /* not above or equal / carry */ + cc_AE, /* above or equal / not carry */ + cc_E, /* equal / zero */ + cc_NE /* not equal / not zero */ +}; + +enum sse_cc { + cc_Equal, + cc_LessThan, + cc_LessThanEqual, + cc_Unordered, + cc_NotEqual, + cc_NotLessThan, + cc_NotLessThanEqual, + cc_Ordered +}; + +#define cc_Z cc_E +#define cc_NZ cc_NE + + +/** generic pointer to function */ +typedef void (*x86_func)(void); + + +/* Begin/end/retrieve function creation: + */ + +enum x86_target +{ + X86_32, + X86_64_STD_ABI, + X86_64_WIN64_ABI +}; + +/* make this read a member of x86_function if target != host is desired */ +static INLINE enum x86_target x86_target( struct x86_function* p ) +{ +#ifdef PIPE_ARCH_X86 + return X86_32; +#elif defined(_WIN64) + return X86_64_WIN64_ABI; +#elif defined(PIPE_ARCH_X86_64) + return X86_64_STD_ABI; +#endif +} + +static INLINE unsigned x86_target_caps( struct x86_function* p ) +{ + return p->caps; +} + +void x86_init_func( struct x86_function *p ); +void x86_init_func_size( struct x86_function *p, unsigned code_size ); +void x86_release_func( struct x86_function *p ); +x86_func x86_get_func( struct x86_function *p ); + +/* Debugging: + */ +void x86_print_reg( struct x86_reg reg ); + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ); + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ); + +struct x86_reg x86_deref( struct x86_reg reg ); + +struct x86_reg x86_get_base_reg( struct x86_reg reg ); + + +/* Labels, jumps and fixup: + */ +int x86_get_label( struct x86_function *p ); + +void x64_rexw(struct x86_function *p); + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + int label ); + +int x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ); + +int x86_jmp_forward( struct x86_function *p); + +int x86_call_forward( struct x86_function *p); + +void x86_fixup_fwd_jump( struct x86_function *p, + int fixup ); + +void x86_jmp( struct x86_function *p, int label ); + +/* void x86_call( struct x86_function *p, void (*label)() ); */ +void x86_call( struct x86_function *p, struct x86_reg reg); + +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void 
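+/* A minimal end-to-end sketch of driving this module (the buffer size
+ * and the constant are arbitrary):
+ *
+ *    struct x86_function f;
+ *    x86_func fn;
+ *
+ *    x86_init_func_size(&f, 1024);
+ *    x86_mov_reg_imm(&f, x86_make_reg(file_REG32, reg_AX), 42);
+ *    x86_ret(&f);
+ *    fn = x86_get_func(&f);          (NULL if the store overflowed)
+ *    ... call fn(), then ...
+ *    x86_release_func(&f);
+ */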
x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ); + + +/* Macro for sse_shufps() and sse2_pshufd(): + */ +#define SHUF(_x,_y,_z,_w) (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6)) +#define SHUF_NOOP RSW(0,1,2,3) +#define GET_SHUF(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +void mmx_emms( struct x86_function *p ); +void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void 
sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr); + +void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); + +void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, + enum sse_cc cc ); +void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); +void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); + +void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_cmovcc( struct x86_function *p, struct x86_reg dst, struct x86_reg src, 
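+                 /* enum x86_cc above follows the hardware condition-code
+                  * numbering (cc_O = 0 ... cc_NE = 5), so cc_E is
+                  * condition 4, the one used by JE/CMOVE; cc_Z/cc_NZ are
+                  * plain aliases of cc_E/cc_NE.
+                  */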
enum x86_cc cc ); +void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_dec( struct x86_function *p, struct x86_reg reg ); +void x86_inc( struct x86_function *p, struct x86_reg reg ); +void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm ); +void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm ); +void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm ); +void x86_mul( struct x86_function *p, struct x86_reg src ); +void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_pop( struct x86_function *p, struct x86_reg reg ); +void x86_push( struct x86_function *p, struct x86_reg reg ); +void x86_push_imm32( struct x86_function *p, int imm ); +void x86_ret( struct x86_function *p ); +void x86_retw( struct x86_function *p, unsigned short imm ); +void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_sahf( struct x86_function *p ); +void x86_div( struct x86_function *p, struct x86_reg src ); +void x86_bswap( struct x86_function *p, struct x86_reg src ); +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); + +void x86_cdecl_caller_push_regs( struct x86_function *p ); +void x86_cdecl_caller_pop_regs( struct x86_function *p ); + +void x87_assert_stack_empty( struct x86_function *p ); + +void x87_f2xm1( struct x86_function *p ); +void x87_fabs( struct x86_function *p ); +void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_faddp( struct x86_function *p, struct x86_reg dst ); +void x87_fchs( struct x86_function *p ); +void x87_fclex( struct x86_function *p ); +void x87_fcmovb( struct x86_function *p, struct x86_reg src ); +void x87_fcmovbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmove( struct x86_function *p, struct x86_reg src ); +void x87_fcmovnb( struct x86_function *p, struct x86_reg src ); +void x87_fcmovnbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmovne( struct x86_function *p, struct x86_reg src ); +void x87_fcom( struct x86_function *p, struct x86_reg dst ); +void x87_fcomi( struct x86_function *p, struct x86_reg dst ); +void x87_fcomip( struct x86_function *p, struct x86_reg dst ); +void x87_fcomp( struct x86_function *p, struct x86_reg dst ); +void x87_fcos( struct x86_function *p ); +void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fdivp( struct x86_function *p, struct x86_reg dst ); +void x87_fdivr( struct x86_function *p, struct 
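+/* For the two-register x87 arithmetic helpers one operand must be st(0):
+ * with st2 = x86_make_reg(file_x87, 2), x87_fadd(p, st0, st2) emits
+ * d8 c2, x87_fadd(p, st2, st0) emits dc c2, and x87_faddp(p, st2) emits
+ * de c2 and pops; any other register pairing trips the assert in
+ * x87_arith_op().
+ */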
x86_reg dst, struct x86_reg arg ); +void x87_fdivrp( struct x86_function *p, struct x86_reg dst ); +void x87_fild( struct x86_function *p, struct x86_reg arg ); +void x87_fist( struct x86_function *p, struct x86_reg dst ); +void x87_fistp( struct x86_function *p, struct x86_reg dst ); +void x87_fld( struct x86_function *p, struct x86_reg arg ); +void x87_fld1( struct x86_function *p ); +void x87_fldcw( struct x86_function *p, struct x86_reg arg ); +void x87_fldl2e( struct x86_function *p ); +void x87_fldln2( struct x86_function *p ); +void x87_fldz( struct x86_function *p ); +void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fmulp( struct x86_function *p, struct x86_reg dst ); +void x87_fnclex( struct x86_function *p ); +void x87_fprndint( struct x86_function *p ); +void x87_fpop( struct x86_function *p ); +void x87_fscale( struct x86_function *p ); +void x87_fsin( struct x86_function *p ); +void x87_fsincos( struct x86_function *p ); +void x87_fsqrt( struct x86_function *p ); +void x87_fst( struct x86_function *p, struct x86_reg dst ); +void x87_fstp( struct x86_function *p, struct x86_reg dst ); +void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubp( struct x86_function *p, struct x86_reg dst ); +void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubrp( struct x86_function *p, struct x86_reg dst ); +void x87_ftst( struct x86_function *p ); +void x87_fxch( struct x86_function *p, struct x86_reg dst ); +void x87_fxtract( struct x86_function *p ); +void x87_fyl2x( struct x86_function *p ); +void x87_fyl2xp1( struct x86_function *p ); +void x87_fwait( struct x86_function *p ); +void x87_fnstcw( struct x86_function *p, struct x86_reg dst ); +void x87_fnstsw( struct x86_function *p, struct x86_reg dst ); +void x87_fucompp( struct x86_function *p ); +void x87_fucomp( struct x86_function *p, struct x86_reg arg ); +void x87_fucom( struct x86_function *p, struct x86_reg arg ); + + + +/* Retrieve a reference to one of the function arguments, taking into + * account any push/pop activity. Note - doesn't track explicit + * manipulation of ESP by other instructions. + */ +struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg ); + +#endif +#endif diff --git a/drivers/video/Gallium/auxiliary/translate/translate.c b/drivers/video/Gallium/auxiliary/translate/translate.c new file mode 100644 index 0000000000..73287b667d --- /dev/null +++ b/drivers/video/Gallium/auxiliary/translate/translate.c @@ -0,0 +1,55 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + */ + +#include "pipe/p_config.h" +#include "pipe/p_state.h" +#include "translate.h" + +struct translate *translate_create( const struct translate_key *key ) +{ + struct translate *translate = NULL; + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + translate = translate_sse2_create( key ); + if (translate) + return translate; +#else + (void)translate; +#endif + + return translate_generic_create( key ); +} + +boolean translate_is_output_format_supported(enum pipe_format format) +{ + return translate_generic_is_output_format_supported(format); +} diff --git a/drivers/video/Gallium/auxiliary/translate/translate.h b/drivers/video/Gallium/auxiliary/translate/translate.h new file mode 100644 index 0000000000..1132114de9 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/translate/translate.h @@ -0,0 +1,160 @@ +/* + * Copyright 2008 Tungsten Graphics, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * Vertex fetch/store/convert code. This functionality is used in two places: + * 1. Vertex fetch/convert - to grab vertex data from incoming vertex + * arrays and convert to format needed by vertex shaders. + * 2. Vertex store/emit - to convert simple float[][4] vertex attributes + * (which is the organization used throughout the draw/prim pipeline) to + * hardware-specific formats and emit into hardware vertex buffers. 
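+ *
+ * A rough usage sketch for case 1 (the local names here are
+ * illustrative only): describe one element in a translate_key, create
+ * the object, point it at the source vertices and run:
+ *
+ *    struct translate_key key;
+ *    struct translate *t;
+ *
+ *    memset(&key, 0, sizeof key);
+ *    key.output_stride = 4 * sizeof(float);
+ *    key.nr_elements = 1;
+ *    key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
+ *    key.element[0].input_format = PIPE_FORMAT_R8G8B8A8_UNORM;
+ *    key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+ *    t = translate_create(&key);
+ *    t->set_buffer(t, 0, verts, vertex_stride, max_index);
+ *    t->run(t, 0, count, 0, 0, output);
+ *    t->release(t);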
+ * + * + * Authors: + * Keith Whitwell + */ + +#ifndef _TRANSLATE_H +#define _TRANSLATE_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" + +enum translate_element_type { + TRANSLATE_ELEMENT_NORMAL, + TRANSLATE_ELEMENT_INSTANCE_ID +}; + +struct translate_element +{ + enum translate_element_type type; + enum pipe_format input_format; + enum pipe_format output_format; + unsigned input_buffer:8; + unsigned input_offset:24; + unsigned instance_divisor; + unsigned output_offset; +}; + + +struct translate_key { + unsigned output_stride; + unsigned nr_elements; + struct translate_element element[PIPE_MAX_ATTRIBS + 1]; +}; + + +struct translate; + + +typedef void (PIPE_CDECL *run_elts_func)(struct translate *, + const unsigned *elts, + unsigned count, + unsigned start_instance, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_elts16_func)(struct translate *, + const uint16_t *elts, + unsigned count, + unsigned start_instance, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_elts8_func)(struct translate *, + const uint8_t *elts, + unsigned count, + unsigned start_instance, + unsigned instance_id, + void *output_buffer); + +typedef void (PIPE_CDECL *run_func)(struct translate *, + unsigned start, + unsigned count, + unsigned start_instance, + unsigned instance_id, + void *output_buffer); + +struct translate { + struct translate_key key; + + void (*release)( struct translate * ); + + void (*set_buffer)( struct translate *, + unsigned i, + const void *ptr, + unsigned stride, + unsigned max_index ); + + run_elts_func run_elts; + run_elts16_func run_elts16; + run_elts8_func run_elts8; + run_func run; +}; + + + +struct translate *translate_create( const struct translate_key *key ); + +boolean translate_is_output_format_supported(enum pipe_format format); + +static INLINE int translate_keysize( const struct translate_key *key ) +{ + return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element); +} + +static INLINE int translate_key_compare( const struct translate_key *a, + const struct translate_key *b ) +{ + int keysize_a = translate_keysize(a); + int keysize_b = translate_keysize(b); + + if (keysize_a != keysize_b) { + return keysize_a - keysize_b; + } + return memcmp(a, b, keysize_a); +} + + +static INLINE void translate_key_sanitize( struct translate_key *a ) +{ + int keysize = translate_keysize(a); + char *ptr = (char *)a; + memset(ptr + keysize, 0, sizeof(*a) - keysize); +} + + +/******************************************************************************* + * Private: + */ +struct translate *translate_sse2_create( const struct translate_key *key ); + +struct translate *translate_generic_create( const struct translate_key *key ); + +boolean translate_generic_is_output_format_supported(enum pipe_format format); + +#endif diff --git a/drivers/video/Gallium/auxiliary/translate/translate_cache.c b/drivers/video/Gallium/auxiliary/translate/translate_cache.c new file mode 100644 index 0000000000..3f1ecb630f --- /dev/null +++ b/drivers/video/Gallium/auxiliary/translate/translate_cache.c @@ -0,0 +1,106 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_state.h" +#include "translate.h" +#include "translate_cache.h" + +#include "cso_cache/cso_cache.h" +#include "cso_cache/cso_hash.h" + +struct translate_cache { + struct cso_hash *hash; +}; + +struct translate_cache * translate_cache_create( void ) +{ + struct translate_cache *cache = MALLOC_STRUCT(translate_cache); + if (cache == NULL) { + return NULL; + } + + cache->hash = cso_hash_create(); + return cache; +} + + +static INLINE void delete_translates(struct translate_cache *cache) +{ + struct cso_hash *hash = cache->hash; + struct cso_hash_iter iter = cso_hash_first_node(hash); + while (!cso_hash_iter_is_null(iter)) { + struct translate *state = (struct translate*)cso_hash_iter_data(iter); + iter = cso_hash_iter_next(iter); + if (state) { + state->release(state); + } + } +} + +void translate_cache_destroy(struct translate_cache *cache) +{ + delete_translates(cache); + cso_hash_delete(cache->hash); + FREE(cache); +} + + +static INLINE unsigned translate_hash_key_size(struct translate_key *key) +{ + unsigned size = sizeof(struct translate_key) - + sizeof(struct translate_element) * (PIPE_MAX_ATTRIBS - key->nr_elements); + return size; +} + +static INLINE unsigned create_key(struct translate_key *key) +{ + unsigned hash_key; + unsigned size = translate_hash_key_size(key); + /*debug_printf("key size = %d, (els = %d)\n", + size, key->nr_elements);*/ + hash_key = cso_construct_key(key, size); + return hash_key; +} + +struct translate * translate_cache_find(struct translate_cache *cache, + struct translate_key *key) +{ + unsigned hash_key = create_key(key); + struct translate *translate = (struct translate*) + cso_hash_find_data_from_template(cache->hash, + hash_key, + key, sizeof(*key)); + + if (!translate) { + /* create/insert */ + translate = translate_create(key); + cso_hash_insert(cache->hash, hash_key, translate); + } + + return translate; +} diff --git a/drivers/video/Gallium/auxiliary/translate/translate_cache.h b/drivers/video/Gallium/auxiliary/translate/translate_cache.h new file mode 100644 index 0000000000..7dba871e57 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/translate/translate_cache.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008 Tungsten Graphics, inc. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _TRANSLATE_CACHE_H
+#define _TRANSLATE_CACHE_H
+
+
+/*******************************************************************************
+ * Translate cache.
+ * Simply used to cache created translates.  Avoids unnecessary creation of
+ * translates if one suitable for a given translate_key has already been
+ * created.
+ *
+ * Note: this functionality depends on, and requires, the CSO module.
+ */
+struct translate_cache;
+
+struct translate_key;
+struct translate;
+
+struct translate_cache *translate_cache_create( void );
+void translate_cache_destroy(struct translate_cache *cache);
+
+/**
+ * Try to find a translate structure matching the given key.
+ * If no such structure exists in the cache, the function
+ * will automatically create it, insert it in the cache and
+ * return the newly created version.
+ */
+struct translate *translate_cache_find(struct translate_cache *cache,
+                                       struct translate_key *key);
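+
+/*
+ * A minimal usage sketch ('key' is assumed to be a filled-in
+ * translate_key):
+ *
+ *    struct translate_cache *cache = translate_cache_create();
+ *    struct translate *t = translate_cache_find(cache, &key);
+ *    ... use t->run() / t->run_elts() ...
+ *    translate_cache_destroy(cache);
+ *
+ * The cache owns the translate objects it hands out: destroying it
+ * releases them all, so callers must not call t->release() themselves.
+ */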
+
+#endif
diff --git a/drivers/video/Gallium/auxiliary/translate/translate_generic.c b/drivers/video/Gallium/auxiliary/translate/translate_generic.c
new file mode 100644
index 0000000000..96e35b0eb4
--- /dev/null
+++ b/drivers/video/Gallium/auxiliary/translate/translate_generic.c
@@ -0,0 +1,998 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+ * Authors:
+ * Keith Whitwell
+ */
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_half.h"
+#include "util/u_math.h"
+#include "pipe/p_state.h"
+#include "translate.h"
+
+
+#define DRAW_DBG 0
+
+typedef void (*fetch_func)(void *dst,
+                           const uint8_t *src,
+                           unsigned i, unsigned j);
+typedef void (*emit_func)(const void *attrib, void *ptr);
+
+
+
+struct translate_generic {
+   struct translate translate;
+
+   struct {
+      enum translate_element_type type;
+
+      fetch_func fetch;
+      unsigned buffer;
+      unsigned input_offset;
+      unsigned instance_divisor;
+
+      emit_func emit;
+      unsigned output_offset;
+
+      const uint8_t *input_ptr;
+      unsigned input_stride;
+      unsigned max_index;
+
+      /* set to -1 for a normal element with output_format != input_format:
+       * u_format is then used to do a full conversion.
+       *
+       * set to the format size in bytes if output_format == input_format,
+       * or for 32-bit instance ids: memcpy then copies that many bytes.
+       */
+      int copy_size;
+
+   } attrib[PIPE_MAX_ATTRIBS];
+
+   unsigned nr_attrib;
+};
+
+
+static struct translate_generic *translate_generic( struct translate *translate )
+{
+   return (struct translate_generic *)translate;
+}
+
+/**
+ * Emit a dword[4] vertex attribute to memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
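+ *
+ * For example, ATTRIB( R8G8B8A8_UNORM, 4, float, ubyte, TO_8_UNORM )
+ * below expands to emit_R8G8B8A8_UNORM(), which scales four floats in
+ * [0,1] to four bytes in [0,255] and stores them at the output pointer.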
+ */ +#define ATTRIB( NAME, SZ, SRCTYPE, DSTTYPE, TO ) \ +static void \ +emit_##NAME(const void *attrib, void *ptr) \ +{ \ + unsigned i; \ + SRCTYPE *in = (SRCTYPE *)attrib; \ + DSTTYPE *out = (DSTTYPE *)ptr; \ + \ + for (i = 0; i < SZ; i++) { \ + out[i] = TO(in[i]); \ + } \ +} + + +#define TO_64_FLOAT(x) ((double) x) +#define TO_32_FLOAT(x) (x) +#define TO_16_FLOAT(x) util_float_to_half(x) + +#define TO_8_USCALED(x) ((unsigned char) x) +#define TO_16_USCALED(x) ((unsigned short) x) +#define TO_32_USCALED(x) ((unsigned int) x) + +#define TO_8_SSCALED(x) ((char) x) +#define TO_16_SSCALED(x) ((short) x) +#define TO_32_SSCALED(x) ((int) x) + +#define TO_8_UNORM(x) ((unsigned char) (x * 255.0f)) +#define TO_16_UNORM(x) ((unsigned short) (x * 65535.0f)) +#define TO_32_UNORM(x) ((unsigned int) (x * 4294967295.0f)) + +#define TO_8_SNORM(x) ((char) (x * 127.0f)) +#define TO_16_SNORM(x) ((short) (x * 32767.0f)) +#define TO_32_SNORM(x) ((int) (x * 2147483647.0f)) + +#define TO_32_FIXED(x) ((int) (x * 65536.0f)) + +#define TO_INT(x) (x) + + +ATTRIB( R64G64B64A64_FLOAT, 4, float, double, TO_64_FLOAT ) +ATTRIB( R64G64B64_FLOAT, 3, float, double, TO_64_FLOAT ) +ATTRIB( R64G64_FLOAT, 2, float, double, TO_64_FLOAT ) +ATTRIB( R64_FLOAT, 1, float, double, TO_64_FLOAT ) + +ATTRIB( R32G32B32A32_FLOAT, 4, float, float, TO_32_FLOAT ) +ATTRIB( R32G32B32_FLOAT, 3, float, float, TO_32_FLOAT ) +ATTRIB( R32G32_FLOAT, 2, float, float, TO_32_FLOAT ) +ATTRIB( R32_FLOAT, 1, float, float, TO_32_FLOAT ) + +ATTRIB( R16G16B16A16_FLOAT, 4, float, ushort, TO_16_FLOAT ) +ATTRIB( R16G16B16_FLOAT, 3, float, ushort, TO_16_FLOAT ) +ATTRIB( R16G16_FLOAT, 2, float, ushort, TO_16_FLOAT ) +ATTRIB( R16_FLOAT, 1, float, ushort, TO_16_FLOAT ) + +ATTRIB( R32G32B32A32_USCALED, 4, float, unsigned, TO_32_USCALED ) +ATTRIB( R32G32B32_USCALED, 3, float, unsigned, TO_32_USCALED ) +ATTRIB( R32G32_USCALED, 2, float, unsigned, TO_32_USCALED ) +ATTRIB( R32_USCALED, 1, float, unsigned, TO_32_USCALED ) + +ATTRIB( R32G32B32A32_SSCALED, 4, float, int, TO_32_SSCALED ) +ATTRIB( R32G32B32_SSCALED, 3, float, int, TO_32_SSCALED ) +ATTRIB( R32G32_SSCALED, 2, float, int, TO_32_SSCALED ) +ATTRIB( R32_SSCALED, 1, float, int, TO_32_SSCALED ) + +ATTRIB( R32G32B32A32_UNORM, 4, float, unsigned, TO_32_UNORM ) +ATTRIB( R32G32B32_UNORM, 3, float, unsigned, TO_32_UNORM ) +ATTRIB( R32G32_UNORM, 2, float, unsigned, TO_32_UNORM ) +ATTRIB( R32_UNORM, 1, float, unsigned, TO_32_UNORM ) + +ATTRIB( R32G32B32A32_SNORM, 4, float, int, TO_32_SNORM ) +ATTRIB( R32G32B32_SNORM, 3, float, int, TO_32_SNORM ) +ATTRIB( R32G32_SNORM, 2, float, int, TO_32_SNORM ) +ATTRIB( R32_SNORM, 1, float, int, TO_32_SNORM ) + +ATTRIB( R16G16B16A16_USCALED, 4, float, ushort, TO_16_USCALED ) +ATTRIB( R16G16B16_USCALED, 3, float, ushort, TO_16_USCALED ) +ATTRIB( R16G16_USCALED, 2, float, ushort, TO_16_USCALED ) +ATTRIB( R16_USCALED, 1, float, ushort, TO_16_USCALED ) + +ATTRIB( R16G16B16A16_SSCALED, 4, float, short, TO_16_SSCALED ) +ATTRIB( R16G16B16_SSCALED, 3, float, short, TO_16_SSCALED ) +ATTRIB( R16G16_SSCALED, 2, float, short, TO_16_SSCALED ) +ATTRIB( R16_SSCALED, 1, float, short, TO_16_SSCALED ) + +ATTRIB( R16G16B16A16_UNORM, 4, float, ushort, TO_16_UNORM ) +ATTRIB( R16G16B16_UNORM, 3, float, ushort, TO_16_UNORM ) +ATTRIB( R16G16_UNORM, 2, float, ushort, TO_16_UNORM ) +ATTRIB( R16_UNORM, 1, float, ushort, TO_16_UNORM ) + +ATTRIB( R16G16B16A16_SNORM, 4, float, short, TO_16_SNORM ) +ATTRIB( R16G16B16_SNORM, 3, float, short, TO_16_SNORM ) +ATTRIB( R16G16_SNORM, 2, float, short, TO_16_SNORM ) +ATTRIB( 
R16_SNORM, 1, float, short, TO_16_SNORM )
+
+ATTRIB( R8G8B8A8_USCALED, 4, float, ubyte, TO_8_USCALED )
+ATTRIB( R8G8B8_USCALED, 3, float, ubyte, TO_8_USCALED )
+ATTRIB( R8G8_USCALED, 2, float, ubyte, TO_8_USCALED )
+ATTRIB( R8_USCALED, 1, float, ubyte, TO_8_USCALED )
+
+ATTRIB( R8G8B8A8_SSCALED, 4, float, char, TO_8_SSCALED )
+ATTRIB( R8G8B8_SSCALED, 3, float, char, TO_8_SSCALED )
+ATTRIB( R8G8_SSCALED, 2, float, char, TO_8_SSCALED )
+ATTRIB( R8_SSCALED, 1, float, char, TO_8_SSCALED )
+
+ATTRIB( R8G8B8A8_UNORM, 4, float, ubyte, TO_8_UNORM )
+ATTRIB( R8G8B8_UNORM, 3, float, ubyte, TO_8_UNORM )
+ATTRIB( R8G8_UNORM, 2, float, ubyte, TO_8_UNORM )
+ATTRIB( R8_UNORM, 1, float, ubyte, TO_8_UNORM )
+
+ATTRIB( R8G8B8A8_SNORM, 4, float, char, TO_8_SNORM )
+ATTRIB( R8G8B8_SNORM, 3, float, char, TO_8_SNORM )
+ATTRIB( R8G8_SNORM, 2, float, char, TO_8_SNORM )
+ATTRIB( R8_SNORM, 1, float, char, TO_8_SNORM )
+
+ATTRIB( R32G32B32A32_UINT, 4, uint32_t, unsigned, TO_INT )
+ATTRIB( R32G32B32_UINT, 3, uint32_t, unsigned, TO_INT )
+ATTRIB( R32G32_UINT, 2, uint32_t, unsigned, TO_INT )
+ATTRIB( R32_UINT, 1, uint32_t, unsigned, TO_INT )
+
+ATTRIB( R16G16B16A16_UINT, 4, uint32_t, ushort, TO_INT )
+ATTRIB( R16G16B16_UINT, 3, uint32_t, ushort, TO_INT )
+ATTRIB( R16G16_UINT, 2, uint32_t, ushort, TO_INT )
+ATTRIB( R16_UINT, 1, uint32_t, ushort, TO_INT )
+
+ATTRIB( R8G8B8A8_UINT, 4, uint32_t, ubyte, TO_INT )
+ATTRIB( R8G8B8_UINT, 3, uint32_t, ubyte, TO_INT )
+ATTRIB( R8G8_UINT, 2, uint32_t, ubyte, TO_INT )
+ATTRIB( R8_UINT, 1, uint32_t, ubyte, TO_INT )
+
+ATTRIB( R32G32B32A32_SINT, 4, int32_t, int, TO_INT )
+ATTRIB( R32G32B32_SINT, 3, int32_t, int, TO_INT )
+ATTRIB( R32G32_SINT, 2, int32_t, int, TO_INT )
+ATTRIB( R32_SINT, 1, int32_t, int, TO_INT )
+
+ATTRIB( R16G16B16A16_SINT, 4, int32_t, short, TO_INT )
+ATTRIB( R16G16B16_SINT, 3, int32_t, short, TO_INT )
+ATTRIB( R16G16_SINT, 2, int32_t, short, TO_INT )
+ATTRIB( R16_SINT, 1, int32_t, short, TO_INT )
+
+ATTRIB( R8G8B8A8_SINT, 4, int32_t, char, TO_INT )
+ATTRIB( R8G8B8_SINT, 3, int32_t, char, TO_INT )
+ATTRIB( R8G8_SINT, 2, int32_t, char, TO_INT )
+ATTRIB( R8_SINT, 1, int32_t, char, TO_INT )
+
+static void
+emit_A8R8G8B8_UNORM( const void *attrib, void *ptr)
+{
+   float *in = (float *)attrib;
+   ubyte *out = (ubyte *)ptr;
+   out[0] = TO_8_UNORM(in[3]);
+   out[1] = TO_8_UNORM(in[0]);
+   out[2] = TO_8_UNORM(in[1]);
+   out[3] = TO_8_UNORM(in[2]);
+}
+
+static void
+emit_B8G8R8A8_UNORM( const void *attrib, void *ptr)
+{
+   float *in = (float *)attrib;
+   ubyte *out = (ubyte *)ptr;
+   out[2] = TO_8_UNORM(in[0]);
+   out[1] = TO_8_UNORM(in[1]);
+   out[0] = TO_8_UNORM(in[2]);
+   out[3] = TO_8_UNORM(in[3]);
+}
+
+static void
+emit_B10G10R10A2_UNORM( const void *attrib, void *ptr )
+{
+   /* as in the ATTRIB emitters above, attrib is the incoming float[4]
+    * and ptr receives the packed dword
+    */
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= ((uint32_t)(CLAMP(src[2], 0, 1) * 0x3ff)) & 0x3ff;
+   value |= (((uint32_t)(CLAMP(src[1], 0, 1) * 0x3ff)) & 0x3ff) << 10;
+   value |= (((uint32_t)(CLAMP(src[0], 0, 1) * 0x3ff)) & 0x3ff) << 20;
+   value |= ((uint32_t)(CLAMP(src[3], 0, 1) * 0x3)) << 30;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_B10G10R10A2_USCALED( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= ((uint32_t)CLAMP(src[2], 0, 1023)) & 0x3ff;
+   value |= (((uint32_t)CLAMP(src[1], 0, 1023)) & 0x3ff) << 10;
+   value |= (((uint32_t)CLAMP(src[0], 0, 1023)) & 0x3ff) << 20;
+   value |= ((uint32_t)CLAMP(src[3], 0, 3)) << 30;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
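+
+/* The B10G10R10A2/R10G10B10A2 packers above and below all follow the
+ * same pattern: clamp each float component, convert it to a 10-bit
+ * integer field (2 bits for alpha) and pack the fields into a single
+ * dword, low bits first.  For B10G10R10A2_UNORM, for instance,
+ * (r,g,b,a) = (1.0, 0.5, 0.0, 1.0) packs as
+ * (3 << 30) | (0x3ff << 20) | (511 << 10) | 0 = 0xfff7fc00.
+ */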
+
+static void
+emit_B10G10R10A2_SNORM( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= (uint32_t)(((uint32_t)(CLAMP(src[2], -1, 1) * 0x1ff)) & 0x3ff);
+   value |= (uint32_t)((((uint32_t)(CLAMP(src[1], -1, 1) * 0x1ff)) & 0x3ff) << 10);
+   value |= (uint32_t)((((uint32_t)(CLAMP(src[0], -1, 1) * 0x1ff)) & 0x3ff) << 20);
+   value |= (uint32_t)(((uint32_t)(CLAMP(src[3], -1, 1) * 0x1)) << 30);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_B10G10R10A2_SSCALED( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= (uint32_t)(((uint32_t)CLAMP(src[2], -512, 511)) & 0x3ff);
+   value |= (uint32_t)((((uint32_t)CLAMP(src[1], -512, 511)) & 0x3ff) << 10);
+   value |= (uint32_t)((((uint32_t)CLAMP(src[0], -512, 511)) & 0x3ff) << 20);
+   value |= (uint32_t)(((uint32_t)CLAMP(src[3], -2, 1)) << 30);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_R10G10B10A2_UNORM( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= ((uint32_t)(CLAMP(src[0], 0, 1) * 0x3ff)) & 0x3ff;
+   value |= (((uint32_t)(CLAMP(src[1], 0, 1) * 0x3ff)) & 0x3ff) << 10;
+   value |= (((uint32_t)(CLAMP(src[2], 0, 1) * 0x3ff)) & 0x3ff) << 20;
+   value |= ((uint32_t)(CLAMP(src[3], 0, 1) * 0x3)) << 30;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_R10G10B10A2_USCALED( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= ((uint32_t)CLAMP(src[0], 0, 1023)) & 0x3ff;
+   value |= (((uint32_t)CLAMP(src[1], 0, 1023)) & 0x3ff) << 10;
+   value |= (((uint32_t)CLAMP(src[2], 0, 1023)) & 0x3ff) << 20;
+   value |= ((uint32_t)CLAMP(src[3], 0, 3)) << 30;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_R10G10B10A2_SNORM( const void *attrib, void *ptr )
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= (uint32_t)(((uint32_t)(CLAMP(src[0], -1, 1) * 0x1ff)) & 0x3ff);
+   value |= (uint32_t)((((uint32_t)(CLAMP(src[1], -1, 1) * 0x1ff)) & 0x3ff) << 10);
+   value |= (uint32_t)((((uint32_t)(CLAMP(src[2], -1, 1) * 0x1ff)) & 0x3ff) << 20);
+   value |= (uint32_t)(((uint32_t)(CLAMP(src[3], -1, 1) * 0x1)) << 30);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_R10G10B10A2_SSCALED( const void *attrib, void *ptr)
+{
+   float *src = (float *)attrib;
+   uint32_t value = 0;
+   value |= (uint32_t)(((uint32_t)CLAMP(src[0], -512, 511)) & 0x3ff);
+   value |= (uint32_t)((((uint32_t)CLAMP(src[1], -512, 511)) & 0x3ff) << 10);
+   value |= (uint32_t)((((uint32_t)CLAMP(src[2], -512, 511)) & 0x3ff) << 20);
+   value |= (uint32_t)(((uint32_t)CLAMP(src[3], -2, 1)) << 30);
+#ifdef PIPE_ARCH_BIG_ENDIAN
+   value = util_bswap32(value);
+#endif
+   *(uint32_t *)ptr = value;
+}
+
+static void
+emit_NULL( const void *attrib, void *ptr )
+{
+   /* do nothing is the only sensible option */
+}
+
+static emit_func get_emit_func( enum pipe_format format )
+{
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return &emit_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return &emit_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return &emit_R64G64B64_FLOAT;
+   case
PIPE_FORMAT_R64G64B64A64_FLOAT: + return &emit_R64G64B64A64_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + return &emit_R32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return &emit_R32G32_FLOAT; + case PIPE_FORMAT_R32G32B32_FLOAT: + return &emit_R32G32B32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return &emit_R32G32B32A32_FLOAT; + + case PIPE_FORMAT_R16_FLOAT: + return &emit_R16_FLOAT; + case PIPE_FORMAT_R16G16_FLOAT: + return &emit_R16G16_FLOAT; + case PIPE_FORMAT_R16G16B16_FLOAT: + return &emit_R16G16B16_FLOAT; + case PIPE_FORMAT_R16G16B16A16_FLOAT: + return &emit_R16G16B16A16_FLOAT; + + case PIPE_FORMAT_R32_UNORM: + return &emit_R32_UNORM; + case PIPE_FORMAT_R32G32_UNORM: + return &emit_R32G32_UNORM; + case PIPE_FORMAT_R32G32B32_UNORM: + return &emit_R32G32B32_UNORM; + case PIPE_FORMAT_R32G32B32A32_UNORM: + return &emit_R32G32B32A32_UNORM; + + case PIPE_FORMAT_R32_USCALED: + return &emit_R32_USCALED; + case PIPE_FORMAT_R32G32_USCALED: + return &emit_R32G32_USCALED; + case PIPE_FORMAT_R32G32B32_USCALED: + return &emit_R32G32B32_USCALED; + case PIPE_FORMAT_R32G32B32A32_USCALED: + return &emit_R32G32B32A32_USCALED; + + case PIPE_FORMAT_R32_SNORM: + return &emit_R32_SNORM; + case PIPE_FORMAT_R32G32_SNORM: + return &emit_R32G32_SNORM; + case PIPE_FORMAT_R32G32B32_SNORM: + return &emit_R32G32B32_SNORM; + case PIPE_FORMAT_R32G32B32A32_SNORM: + return &emit_R32G32B32A32_SNORM; + + case PIPE_FORMAT_R32_SSCALED: + return &emit_R32_SSCALED; + case PIPE_FORMAT_R32G32_SSCALED: + return &emit_R32G32_SSCALED; + case PIPE_FORMAT_R32G32B32_SSCALED: + return &emit_R32G32B32_SSCALED; + case PIPE_FORMAT_R32G32B32A32_SSCALED: + return &emit_R32G32B32A32_SSCALED; + + case PIPE_FORMAT_R16_UNORM: + return &emit_R16_UNORM; + case PIPE_FORMAT_R16G16_UNORM: + return &emit_R16G16_UNORM; + case PIPE_FORMAT_R16G16B16_UNORM: + return &emit_R16G16B16_UNORM; + case PIPE_FORMAT_R16G16B16A16_UNORM: + return &emit_R16G16B16A16_UNORM; + + case PIPE_FORMAT_R16_USCALED: + return &emit_R16_USCALED; + case PIPE_FORMAT_R16G16_USCALED: + return &emit_R16G16_USCALED; + case PIPE_FORMAT_R16G16B16_USCALED: + return &emit_R16G16B16_USCALED; + case PIPE_FORMAT_R16G16B16A16_USCALED: + return &emit_R16G16B16A16_USCALED; + + case PIPE_FORMAT_R16_SNORM: + return &emit_R16_SNORM; + case PIPE_FORMAT_R16G16_SNORM: + return &emit_R16G16_SNORM; + case PIPE_FORMAT_R16G16B16_SNORM: + return &emit_R16G16B16_SNORM; + case PIPE_FORMAT_R16G16B16A16_SNORM: + return &emit_R16G16B16A16_SNORM; + + case PIPE_FORMAT_R16_SSCALED: + return &emit_R16_SSCALED; + case PIPE_FORMAT_R16G16_SSCALED: + return &emit_R16G16_SSCALED; + case PIPE_FORMAT_R16G16B16_SSCALED: + return &emit_R16G16B16_SSCALED; + case PIPE_FORMAT_R16G16B16A16_SSCALED: + return &emit_R16G16B16A16_SSCALED; + + case PIPE_FORMAT_R8_UNORM: + return &emit_R8_UNORM; + case PIPE_FORMAT_R8G8_UNORM: + return &emit_R8G8_UNORM; + case PIPE_FORMAT_R8G8B8_UNORM: + return &emit_R8G8B8_UNORM; + case PIPE_FORMAT_R8G8B8A8_UNORM: + return &emit_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R8_USCALED: + return &emit_R8_USCALED; + case PIPE_FORMAT_R8G8_USCALED: + return &emit_R8G8_USCALED; + case PIPE_FORMAT_R8G8B8_USCALED: + return &emit_R8G8B8_USCALED; + case PIPE_FORMAT_R8G8B8A8_USCALED: + return &emit_R8G8B8A8_USCALED; + + case PIPE_FORMAT_R8_SNORM: + return &emit_R8_SNORM; + case PIPE_FORMAT_R8G8_SNORM: + return &emit_R8G8_SNORM; + case PIPE_FORMAT_R8G8B8_SNORM: + return &emit_R8G8B8_SNORM; + case PIPE_FORMAT_R8G8B8A8_SNORM: + return &emit_R8G8B8A8_SNORM; + + case PIPE_FORMAT_R8_SSCALED: + return &emit_R8_SSCALED; + 
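+
+   /* A sketch of how this table is meant to be used (generic_run_one()
+    * further down is the real caller): fetch the attribute into a
+    * float[4] with the format's fetch function, then emit it in the
+    * output format:
+    *
+    *    float data[4];
+    *    tg->attrib[attr].fetch( data, src, 0, 0 );
+    *    get_emit_func( output_format )( data, dst );
+    */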
case PIPE_FORMAT_R8G8_SSCALED: + return &emit_R8G8_SSCALED; + case PIPE_FORMAT_R8G8B8_SSCALED: + return &emit_R8G8B8_SSCALED; + case PIPE_FORMAT_R8G8B8A8_SSCALED: + return &emit_R8G8B8A8_SSCALED; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + return &emit_B8G8R8A8_UNORM; + + case PIPE_FORMAT_A8R8G8B8_UNORM: + return &emit_A8R8G8B8_UNORM; + + case PIPE_FORMAT_R32_UINT: + return &emit_R32_UINT; + case PIPE_FORMAT_R32G32_UINT: + return &emit_R32G32_UINT; + case PIPE_FORMAT_R32G32B32_UINT: + return &emit_R32G32B32_UINT; + case PIPE_FORMAT_R32G32B32A32_UINT: + return &emit_R32G32B32A32_UINT; + + case PIPE_FORMAT_R16_UINT: + return &emit_R16_UINT; + case PIPE_FORMAT_R16G16_UINT: + return &emit_R16G16_UINT; + case PIPE_FORMAT_R16G16B16_UINT: + return &emit_R16G16B16_UINT; + case PIPE_FORMAT_R16G16B16A16_UINT: + return &emit_R16G16B16A16_UINT; + + case PIPE_FORMAT_R8_UINT: + return &emit_R8_UINT; + case PIPE_FORMAT_R8G8_UINT: + return &emit_R8G8_UINT; + case PIPE_FORMAT_R8G8B8_UINT: + return &emit_R8G8B8_UINT; + case PIPE_FORMAT_R8G8B8A8_UINT: + return &emit_R8G8B8A8_UINT; + + case PIPE_FORMAT_R32_SINT: + return &emit_R32_SINT; + case PIPE_FORMAT_R32G32_SINT: + return &emit_R32G32_SINT; + case PIPE_FORMAT_R32G32B32_SINT: + return &emit_R32G32B32_SINT; + case PIPE_FORMAT_R32G32B32A32_SINT: + return &emit_R32G32B32A32_SINT; + + case PIPE_FORMAT_R16_SINT: + return &emit_R16_SINT; + case PIPE_FORMAT_R16G16_SINT: + return &emit_R16G16_SINT; + case PIPE_FORMAT_R16G16B16_SINT: + return &emit_R16G16B16_SINT; + case PIPE_FORMAT_R16G16B16A16_SINT: + return &emit_R16G16B16A16_SINT; + + case PIPE_FORMAT_R8_SINT: + return &emit_R8_SINT; + case PIPE_FORMAT_R8G8_SINT: + return &emit_R8G8_SINT; + case PIPE_FORMAT_R8G8B8_SINT: + return &emit_R8G8B8_SINT; + case PIPE_FORMAT_R8G8B8A8_SINT: + return &emit_R8G8B8A8_SINT; + + case PIPE_FORMAT_B10G10R10A2_UNORM: + return &emit_B10G10R10A2_UNORM; + case PIPE_FORMAT_B10G10R10A2_USCALED: + return &emit_B10G10R10A2_USCALED; + case PIPE_FORMAT_B10G10R10A2_SNORM: + return &emit_B10G10R10A2_SNORM; + case PIPE_FORMAT_B10G10R10A2_SSCALED: + return &emit_B10G10R10A2_SSCALED; + + case PIPE_FORMAT_R10G10B10A2_UNORM: + return &emit_R10G10B10A2_UNORM; + case PIPE_FORMAT_R10G10B10A2_USCALED: + return &emit_R10G10B10A2_USCALED; + case PIPE_FORMAT_R10G10B10A2_SNORM: + return &emit_R10G10B10A2_SNORM; + case PIPE_FORMAT_R10G10B10A2_SSCALED: + return &emit_R10G10B10A2_SSCALED; + + default: + assert(0); + return &emit_NULL; + } +} + +static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, + unsigned elt, + unsigned start_instance, + unsigned instance_id, + void *vert ) +{ + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset; + + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + const uint8_t *src; + unsigned index; + int copy_size; + + if (tg->attrib[attr].instance_divisor) { + index = start_instance; + index += (instance_id - start_instance) / + tg->attrib[attr].instance_divisor; + /* XXX we need to clamp the index here too, but to a + * per-array max value, not the draw->pt.max_index value + * that's being given to us via translate->set_buffer(). 
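+             *
+             * For illustration (numbers assumed): with start_instance 2,
+             * instance_id 10 and instance_divisor 3, the code above yields
+             * index = 2 + (10 - 2) / 3 = 4, so instances 8, 9 and 10 all
+             * fetch array element 4.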
+             */
+         }
+         else {
+            index = elt;
+
+            /* clamp to avoid going out of bounds */
+            index = MIN2(index, tg->attrib[attr].max_index);
+         }
+
+         src = tg->attrib[attr].input_ptr +
+               tg->attrib[attr].input_stride * index;
+
+         copy_size = tg->attrib[attr].copy_size;
+         if(likely(copy_size >= 0))
+            memcpy(dst, src, copy_size);
+         else
+         {
+            tg->attrib[attr].fetch( data, src, 0, 0 );
+
+            if (0)
+               debug_printf("Fetch linear attr %d from %p stride %d index %d: "
+                            " %f, %f, %f, %f \n",
+                            attr,
+                            tg->attrib[attr].input_ptr,
+                            tg->attrib[attr].input_stride,
+                            index,
+                            data[0], data[1], data[2], data[3]);
+
+            tg->attrib[attr].emit( data, dst );
+         }
+      } else {
+         if(likely(tg->attrib[attr].copy_size >= 0))
+            memcpy(dst, &instance_id, 4);
+         else
+         {
+            data[0] = (float)instance_id;
+            tg->attrib[attr].emit( data, dst );
+         }
+      }
+   }
+}
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void PIPE_CDECL generic_run_elts( struct translate *translate,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         unsigned start_instance,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+static void PIPE_CDECL generic_run_elts16( struct translate *translate,
+                                           const uint16_t *elts,
+                                           unsigned count,
+                                           unsigned start_instance,
+                                           unsigned instance_id,
+                                           void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+static void PIPE_CDECL generic_run_elts8( struct translate *translate,
+                                          const uint8_t *elts,
+                                          unsigned count,
+                                          unsigned start_instance,
+                                          unsigned instance_id,
+                                          void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+static void PIPE_CDECL generic_run( struct translate *translate,
+                                    unsigned start,
+                                    unsigned count,
+                                    unsigned start_instance,
+                                    unsigned instance_id,
+                                    void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, start + i, start_instance, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+
+
+static void generic_set_buffer( struct translate *translate,
+                                unsigned buf,
+                                const void *ptr,
+                                unsigned stride,
+                                unsigned max_index )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   unsigned i;
+
+   for (i = 0; i < tg->nr_attrib; i++) {
+      if (tg->attrib[i].buffer == buf) {
+         tg->attrib[i].input_ptr = ((const uint8_t *)ptr +
+                                    tg->attrib[i].input_offset);
+         tg->attrib[i].input_stride = stride;
+         tg->attrib[i].max_index = max_index;
+      }
+   }
+}
+
+
+static void generic_release( struct translate *translate )
+{
+   /* Refcount?
+    */
+   FREE(translate);
+}
+
+static boolean
+is_legal_int_format_combo( const struct util_format_description *src,
+                           const struct util_format_description *dst )
+{
+   unsigned i;
+   unsigned nr = MIN2(src->nr_channels, dst->nr_channels);
+
+   for (i = 0; i < nr; i++) {
+      /* The signs must match.
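+       * For example, R8_UINT -> R16_UINT widens without loss and is
+       * accepted, while R16_UINT -> R8_UINT would truncate and
+       * R8_SINT -> R8_UINT would reinterpret the sign, so both are
+       * rejected by the checks below.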
*/ + if (src->channel[i].type != dst->channel[i].type) { + return FALSE; + } + + /* Integers must not lose precision at any point in the pipeline. */ + if (src->channel[i].size > dst->channel[i].size) { + return FALSE; + } + } + return TRUE; +} + +struct translate *translate_generic_create( const struct translate_key *key ) +{ + struct translate_generic *tg = CALLOC_STRUCT(translate_generic); + unsigned i; + + if (tg == NULL) + return NULL; + + tg->translate.key = *key; + tg->translate.release = generic_release; + tg->translate.set_buffer = generic_set_buffer; + tg->translate.run_elts = generic_run_elts; + tg->translate.run_elts16 = generic_run_elts16; + tg->translate.run_elts8 = generic_run_elts8; + tg->translate.run = generic_run; + + for (i = 0; i < key->nr_elements; i++) { + const struct util_format_description *format_desc = + util_format_description(key->element[i].input_format); + + assert(format_desc); + + tg->attrib[i].type = key->element[i].type; + + if (format_desc->channel[0].pure_integer) { + const struct util_format_description *out_format_desc = + util_format_description(key->element[i].output_format); + + if (!is_legal_int_format_combo(format_desc, out_format_desc)) { + FREE(tg); + return NULL; + } + + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) { + assert(format_desc->fetch_rgba_sint); + tg->attrib[i].fetch = (fetch_func)format_desc->fetch_rgba_sint; + } else { + assert(format_desc->fetch_rgba_uint); + tg->attrib[i].fetch = (fetch_func)format_desc->fetch_rgba_uint; + } + } else { + assert(format_desc->fetch_rgba_float); + tg->attrib[i].fetch = (fetch_func)format_desc->fetch_rgba_float; + } + + tg->attrib[i].buffer = key->element[i].input_buffer; + tg->attrib[i].input_offset = key->element[i].input_offset; + tg->attrib[i].instance_divisor = key->element[i].instance_divisor; + + tg->attrib[i].output_offset = key->element[i].output_offset; + + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } + + if(tg->attrib[i].copy_size < 0) + tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + else + tg->attrib[i].emit = NULL; + } + + tg->nr_attrib = key->nr_elements; + + + return &tg->translate; +} + +boolean translate_generic_is_output_format_supported(enum pipe_format format) +{ + switch(format) + { + case PIPE_FORMAT_R64G64B64A64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64B64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64_FLOAT: return TRUE; + case PIPE_FORMAT_R64_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32B32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32_FLOAT: return TRUE; + case PIPE_FORMAT_R32_FLOAT: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_FLOAT: return TRUE; + case PIPE_FORMAT_R16G16B16_FLOAT: return TRUE; + case PIPE_FORMAT_R16G16_FLOAT: return TRUE; + case PIPE_FORMAT_R16_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_USCALED: return TRUE; + case PIPE_FORMAT_R32G32B32_USCALED: return TRUE; + case PIPE_FORMAT_R32G32_USCALED: return TRUE; + case PIPE_FORMAT_R32_USCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SSCALED: return 
TRUE; + case PIPE_FORMAT_R32G32B32_SSCALED: return TRUE; + case PIPE_FORMAT_R32G32_SSCALED: return TRUE; + case PIPE_FORMAT_R32_SSCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32_UNORM: return TRUE; + case PIPE_FORMAT_R32_UNORM: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32_SNORM: return TRUE; + case PIPE_FORMAT_R32_SNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16_USCALED: return TRUE; + case PIPE_FORMAT_R16_USCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16_SSCALED: return TRUE; + case PIPE_FORMAT_R16_SSCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16_UNORM: return TRUE; + case PIPE_FORMAT_R16_UNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16_SNORM: return TRUE; + case PIPE_FORMAT_R16_SNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8_USCALED: return TRUE; + case PIPE_FORMAT_R8_USCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8_SSCALED: return TRUE; + case PIPE_FORMAT_R8_SSCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_UNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_UNORM: return TRUE; + case PIPE_FORMAT_R8G8_UNORM: return TRUE; + case PIPE_FORMAT_R8_UNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8_SNORM: return TRUE; + case PIPE_FORMAT_R8_SNORM: return TRUE; + + case PIPE_FORMAT_A8R8G8B8_UNORM: return TRUE; + case PIPE_FORMAT_B8G8R8A8_UNORM: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_UINT: return TRUE; + case PIPE_FORMAT_R32G32B32_UINT: return TRUE; + case PIPE_FORMAT_R32G32_UINT: return TRUE; + case PIPE_FORMAT_R32_UINT: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_UINT: return TRUE; + case PIPE_FORMAT_R16G16B16_UINT: return TRUE; + case PIPE_FORMAT_R16G16_UINT: return TRUE; + case PIPE_FORMAT_R16_UINT: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_UINT: return TRUE; + case PIPE_FORMAT_R8G8B8_UINT: return TRUE; + case PIPE_FORMAT_R8G8_UINT: return TRUE; + case PIPE_FORMAT_R8_UINT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SINT: return TRUE; + case PIPE_FORMAT_R32G32B32_SINT: return TRUE; + case PIPE_FORMAT_R32G32_SINT: return TRUE; + case PIPE_FORMAT_R32_SINT: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SINT: return TRUE; + case PIPE_FORMAT_R16G16B16_SINT: return TRUE; + case PIPE_FORMAT_R16G16_SINT: return TRUE; + case PIPE_FORMAT_R16_SINT: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SINT: return TRUE; + case PIPE_FORMAT_R8G8B8_SINT: return TRUE; + case PIPE_FORMAT_R8G8_SINT: return TRUE; + case PIPE_FORMAT_R8_SINT: return TRUE; + + case PIPE_FORMAT_B10G10R10A2_UNORM: return TRUE; + case PIPE_FORMAT_B10G10R10A2_USCALED: return TRUE; + case PIPE_FORMAT_B10G10R10A2_SNORM: return TRUE; + case PIPE_FORMAT_B10G10R10A2_SSCALED: return TRUE; + + case PIPE_FORMAT_R10G10B10A2_UNORM: return TRUE; + case 
PIPE_FORMAT_R10G10B10A2_USCALED: return TRUE; + case PIPE_FORMAT_R10G10B10A2_SNORM: return TRUE; + case PIPE_FORMAT_R10G10B10A2_SSCALED: return TRUE; + + default: return FALSE; + } +} diff --git a/drivers/video/Gallium/auxiliary/translate/translate_sse.c b/drivers/video/Gallium/auxiliary/translate/translate_sse.c new file mode 100644 index 0000000000..a4f7b243c1 --- /dev/null +++ b/drivers/video/Gallium/auxiliary/translate/translate_sse.c @@ -0,0 +1,1573 @@ +/* + * Copyright 2003 Tungsten Graphics, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Keith Whitwell + */ + + +#include "pipe/p_config.h" +#include "pipe/p_compiler.h" +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_format.h" + +#include "translate.h" + + +#if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED) + +#include "rtasm/rtasm_cpu.h" +#include "rtasm/rtasm_x86sse.h" + + +#define X 0 +#define Y 1 +#define Z 2 +#define W 3 + + +struct translate_buffer { + const void *base_ptr; + uintptr_t stride; + unsigned max_index; +}; + +struct translate_buffer_variant { + unsigned buffer_index; + unsigned instance_divisor; + void *ptr; /* updated either per vertex or per instance */ +}; + + +#define ELEMENT_BUFFER_INSTANCE_ID 1001 + +#define NUM_CONSTS 7 + +enum +{ + CONST_IDENTITY, + CONST_INV_127, + CONST_INV_255, + CONST_INV_32767, + CONST_INV_65535, + CONST_INV_2147483647, + CONST_255 +}; + +#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} +static float consts[NUM_CONSTS][4] = { + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) +}; +#undef C + +struct translate_sse { + struct translate translate; + + struct x86_function linear_func; + struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; + struct x86_function *func; + + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + int8_t reg_to_const[16]; + int8_t const_to_reg[NUM_CONSTS]; + + struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; + unsigned nr_buffers; + + /* Multiple buffer variants can map to a single buffer. */ + struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS]; + unsigned nr_buffer_variants; + + /* Multiple elements can map to a single buffer variant. 
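+    * For example (layout assumed): two per-vertex elements reading
+    * interleaved position and normal from buffer 0 with instance_divisor
+    * 0 share one variant, while an element reading buffer 0 with
+    * divisor 1 needs a second variant, since its pointer advances per
+    * instance rather than per vertex.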
*/ + unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS]; + + boolean use_instancing; + unsigned instance_id; + unsigned start_instance; + + /* these are actually known values, but putting them in a struct + * like this is helpful to keep them in sync across the file. + */ + struct x86_reg tmp_EAX; + struct x86_reg tmp2_EDX; + struct x86_reg src_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ +}; + +static int get_offset( const void *a, const void *b ) +{ + return (const char *)b - (const char *)a; +} + +static struct x86_reg get_const( struct translate_sse *p, unsigned id) +{ + struct x86_reg reg; + unsigned i; + + if(p->const_to_reg[id] >= 0) + return x86_make_reg(file_XMM, p->const_to_reg[id]); + + for(i = 2; i < 8; ++i) + { + if(p->reg_to_const[i] < 0) + break; + } + + /* TODO: be smarter here */ + if(i == 8) + --i; + + reg = x86_make_reg(file_XMM, i); + + if(p->reg_to_const[i] >= 0) + p->const_to_reg[p->reg_to_const[i]] = -1; + + p->reg_to_const[i] = id; + p->const_to_reg[id] = i; + + /* TODO: this should happen outside the loop, if possible */ + sse_movaps(p->func, reg, + x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); + + return reg; +} + +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src, + unsigned size) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; + } + return TRUE; +} + +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 + +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. Only pass this value if + * chans < 4 or results are undefined. 
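+ *
+ * For instance, chans = 2 with out_chans = 4 yields (a, b, 0, 0) in the
+ * register, while chans = 2 with out_chans = CHANNELS_0001 yields
+ * (a, b, 0, 1).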
+ */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; + } +} + +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} + +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) +{ + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? 
*/ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } +} + +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) +{ + emit_mov64(p, dst_gpr, dst_xmm, src, src); +} + +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) +{ + emit_mov64(p, dst, dst, src_gpr, src_xmm); +} + +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) +{ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); +} + +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) +{ + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } +} + +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) + +{ + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans 
= 0; + unsigned imms[2] = {0, 0x3f800000}; + + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; + + if(input_desc->channel[0].size & 7) + return FALSE; + + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; + + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } + + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } + + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_255); + break; + case 16: + factor = get_const(p, CONST_INV_65535); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, 
dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_const(p, CONST_INV_127);
+                  break;
+               case 16:
+                  factor = get_const(p, CONST_INV_32767);
+                  break;
+               case 32:
+                  factor = get_const(p, CONST_INV_2147483647);
+                  break;
+               default:
+                  assert(0);
+                  factor.disp = 0;
+                  factor.file = 0;
+                  factor.idx = 0;
+                  factor.mod = 0;
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            break;
+         case UTIL_FORMAT_TYPE_FLOAT:
+            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+               return FALSE;
+            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
+            {
+               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
+               needed_chans = CHANNELS_0001;
+            }
+            switch(input_desc->channel[0].size)
+            {
+            case 32:
+               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            case 64: /* we lose precision here */
+               if(!(x86_target_caps(p->func) & X86_SSE2))
+                  return FALSE;
+               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            default:
+               return FALSE;
+            }
+            break;
+         default:
+            return FALSE;
+         }
+
+         if(!id_swizzle)
+            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+      }
+
+      if(output_desc->nr_channels >= 4
+         && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+         )
+         sse_movups(p->func, dst, dataXMM);
+      else
+      {
+         if(output_desc->nr_channels >= 2
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+            sse_movlps(p->func, dst, dataXMM);
+         else
+         {
+            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movss(p->func, dst, dataXMM);
+            else
+               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+
+            if(output_desc->nr_channels >= 2)
+            {
+               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+
+         if(output_desc->nr_channels >= 3)
+         {
+            if(output_desc->nr_channels >= 4
+               && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+               && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
+            else
+            {
+               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+               if(output_desc->nr_channels >= 4)
+               {
+                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+                  {
+                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
+                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
+                  }
+                  else
+                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+               }
+            }
+         }
+      }
+      return TRUE;
+   }
+   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
+           && output_desc->channel[0].normalized == input_desc->channel[0].normalized
+           && (0
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+              ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned imms[2] = {0, 1};
+
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
+
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
+
+      if(needed_chans > 0)
+      {
+         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+                  sse2_psrlw_imm(p->func, dataXMM, 1);
+            }
+            else
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
+               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
+               sse2_psllw_imm(p->func, dataXMM, 9);
+               sse2_psrlw_imm(p->func, dataXMM, 8);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               sse2_psrlw_imm(p->func, dataXMM, 7);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               {
+                  struct x86_reg t = dataXMM;
+                  dataXMM = tmpXMM;
+                  tmpXMM = t;
+               }
+            }
+            else
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psraw_imm(p->func, dataXMM, 8);
+            }
+            break;
+         default:
+            assert(0);
+         }
+
+         if(output_desc->channel[0].normalized)
+            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;
+
+         if(!id_swizzle)
+            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+      }
+
+      if(output_desc->nr_channels >= 4
+         && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+         && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+         )
+         sse2_movq(p->func, dst, dataXMM);
+      else
+      {
+         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               sse2_movd(p->func, dst, dataXMM);
+            else
+            {
+               sse2_movd(p->func, tmp, dataXMM);
+               x86_mov16(p->func, dst, tmp);
+               if(output_desc->nr_channels >= 2)
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+         else
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
+               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+            else
+            {
+               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+               if(output_desc->nr_channels >= 2)
+               {
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_shr_imm(p->func, tmp, 16);
+                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
+               }
+            }
+         }
+
+         if(output_desc->nr_channels >= 3)
+         {
+            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+                  }
+               }
+            }
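+            /* swizzle[2] names a constant channel (SWIZZLE_0/SWIZZLE_1):
+             * store it as an immediate from imms[], i.e. 0 for SWIZZLE_0
+             * and, for SWIZZLE_1, 1 for scaled outputs or 0xffff/0x7fff
+             * for normalized UNORM16/SNORM16 outputs.
+             */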
else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; + } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } + + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 
0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ + else if((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 + || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + + /* load */ + sse_movups(p->func, dataXMM, src); + + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); + + /* scale by 255.0 */ + sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); + + /* pack and emit */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse2_movd(p->func, dst, dataXMM); + + return TRUE; + } + + return FALSE; +} + +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} + +static boolean init_inputs( struct translate_sse *p, + unsigned index_size ) +{ + unsigned i; + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, + get_offset(p, &p->instance_id)); + struct x86_reg start_instance = x86_make_disp(p->machine_EDI, + get_offset(p, &p->start_instance)); + + for (i = 0; i < p->nr_buffer_variants; i++) { + struct translate_buffer_variant *variant = &p->buffer_variant[i]; + struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; + + if (!index_size || variant->instance_divisor) { + struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI, + get_offset(p, &buffer->max_index)); + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, + get_offset(p, &buffer->stride)); + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, + get_offset(p, &variant->ptr)); + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, + get_offset(p, &buffer->base_ptr)); + struct x86_reg elt = p->idx_ESI; + struct x86_reg tmp_EAX = p->tmp_EAX; + + /* Calculate pointer to first attrib: + * base_ptr + stride * index, where index depends on instance divisor + */ + if (variant->instance_divisor) { + /* Start with instance = instance_id + * which is true if divisor is 1. 
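+          * With a larger divisor the generated code computes
+          * (instance_id - start_instance) / divisor + start_instance
+          * with x86_div, i.e. EAX = EDX:EAX / ECX.  For example (numbers
+          * assumed): divisor 4, start_instance 5 and instance_id 13 give
+          * 8 / 4 + 5 = 7, so instances 13..16 all read element 7.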
+ */ + x86_mov(p->func, tmp_EAX, instance_id); + + if (variant->instance_divisor != 1) { + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->src_ECX; + + /* instance_num = instance_id - start_instance */ + x86_mov(p->func, tmp_EDX, start_instance); + x86_sub(p->func, tmp_EAX, tmp_EDX); + + /* TODO: Add x86_shr() to rtasm and use it whenever + * instance divisor is power of two. + */ + x86_xor(p->func, tmp_EDX, tmp_EDX); + x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); + x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ + + /* instance = (instance_id - start_instance) / divisor + + * start_instance + */ + x86_mov(p->func, tmp_EDX, start_instance); + x86_add(p->func, tmp_EAX, tmp_EDX); + } + + /* XXX we need to clamp the index here too, but to a + * per-array max value, not the draw->pt.max_index value + * that's being given to us via translate->set_buffer(). + */ + } else { + x86_mov(p->func, tmp_EAX, elt); + + /* Clamp to max_index + */ + x86_cmp(p->func, tmp_EAX, buf_max_index); + x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE); + } + + x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, tmp_EAX, buf_base_ptr); + + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); + + /* In the linear case, keep the buffer pointer instead of the + * index number. + */ + if (!index_size && p->nr_buffer_variants == 1) + { + x64_rexw(p->func); + x86_mov(p->func, elt, tmp_EAX); + } + else + { + x64_rexw(p->func); + x86_mov(p->func, buf_ptr, tmp_EAX); + } + } + } + + return TRUE; +} + + +static struct x86_reg get_buffer_ptr( struct translate_sse *p, + unsigned index_size, + unsigned var_idx, + struct x86_reg elt ) +{ + if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { + return x86_make_disp(p->machine_EDI, + get_offset(p, &p->instance_id)); + } + if (!index_size && p->nr_buffer_variants == 1) { + return p->idx_ESI; + } + else if (!index_size || p->buffer_variant[var_idx].instance_divisor) { + struct x86_reg ptr = p->src_ECX; + struct x86_reg buf_ptr = + x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer_variant[var_idx].ptr)); + + x64_rexw(p->func); + x86_mov(p->func, ptr, buf_ptr); + return ptr; + } + else { + struct x86_reg ptr = p->src_ECX; + const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx]; + + struct x86_reg buf_stride = + x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[variant->buffer_index].stride)); + + struct x86_reg buf_base_ptr = + x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); + + struct x86_reg buf_max_index = + x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[variant->buffer_index].max_index)); + + + + /* Calculate pointer to current attrib: + */ + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + + /* Clamp to max_index + */ + x86_cmp(p->func, ptr, buf_max_index); + x86_cmovcc(p->func, ptr, buf_max_index, cc_AE); + + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); + x86_add(p->func, ptr, buf_base_ptr); + return ptr; + } +} + + + +static boolean incr_inputs( struct translate_sse *p, + unsigned index_size ) +{ + if (!index_size && p->nr_buffer_variants == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[0].stride)); + + if (p->buffer_variant[0].instance_divisor == 0) { + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, 
x86_make_disp(p->idx_ESI, 192)); + } + } + else if (!index_size) { + unsigned i; + + /* Is this worthwhile?? + */ + for (i = 0; i < p->nr_buffer_variants; i++) { + struct translate_buffer_variant *variant = &p->buffer_variant[i]; + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, + get_offset(p, &variant->ptr)); + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, + get_offset(p, &p->buffer[variant->buffer_index].stride)); + + if (variant->instance_divisor == 0) { + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); + if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); + x86_mov(p->func, buf_ptr, p->tmp_EAX); + } + } + } + else { + x64_rexw(p->func); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); + } + + return TRUE; +} + + +/* Build run( struct translate *machine, + * unsigned start, + * unsigned count, + * void *output_buffer ) + * or + * run_elts( struct translate *machine, + * unsigned *elts, + * unsigned count, + * void *output_buffer ) + * + * Lots of hardcoding + * + * EAX -- pointer to current output vertex + * ECX -- pointer to current attribute + * + */ +static boolean build_vertex_emit( struct translate_sse *p, + struct x86_function *func, + unsigned index_size ) +{ + int fixup, label; + unsigned j; + + memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); + memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); + + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); + + p->func = func; + + x86_init_func(p->func); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } + + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); + else + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); + + /* Load instance ID. 
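+    * Arguments 4 and 5 of the generated function are start_instance and
+    * instance_id; they are stored into the machine struct so the
+    * per-vertex code can reach them as displacements off machine_EDI
+    * (see get_offset() above).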
+
+
+/* Build run( struct translate *machine,
+ *            unsigned start,
+ *            unsigned count,
+ *            unsigned start_instance,
+ *            unsigned instance_id,
+ *            void *output_buffer )
+ * or
+ *       run_elts( struct translate *machine,
+ *                 unsigned *elts,
+ *                 unsigned count,
+ *                 unsigned start_instance,
+ *                 unsigned instance_id,
+ *                 void *output_buffer )
+ *
+ * Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to current attribute
+ */
+static boolean build_vertex_emit( struct translate_sse *p,
+                                  struct x86_function *func,
+                                  unsigned index_size )
+{
+   int fixup, label;
+   unsigned j;
+
+   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
+   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
+
+   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
+   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
+   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
+   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
+   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
+   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
+   p->src_ECX = x86_make_reg(file_REG32, reg_CX);
+
+   p->func = func;
+
+   x86_init_func(p->func);
+
+   if (x86_target(p->func) == X86_64_WIN64_ABI) {
+      /* The ABI guarantees a 16-byte aligned 32-byte "shadow space"
+       * above the return address; save callee-saved XMM6/XMM7 there. */
+      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
+      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+   }
+
+   x86_push(p->func, p->outbuf_EBX);
+   x86_push(p->func, p->count_EBP);
+
+   /* on non-Win64 x86-64, these are already in the right registers */
+   if (x86_target(p->func) != X86_64_STD_ABI) {
+      x86_push(p->func, p->machine_EDI);
+      x86_push(p->func, p->idx_ESI);
+
+      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+   }
+
+   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
+
+   if (x86_target(p->func) != X86_32)
+      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
+   else
+      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
+
+   /* Load start_instance and instance ID.
+    */
+   if (p->use_instancing) {
+      x86_mov(p->func,
+              p->tmp2_EDX,
+              x86_fn_arg(p->func, 4));
+      x86_mov(p->func,
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
+              p->tmp2_EDX);
+
+      x86_mov(p->func,
+              p->tmp_EAX,
+              x86_fn_arg(p->func, 5));
+      x86_mov(p->func,
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
+              p->tmp_EAX);
+   }
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
+   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
+   fixup = x86_jcc_forward(p->func, cc_E);
+
+   /* always load, needed or not:
+    */
+   init_inputs(p, index_size);
+
+   /* Note address for loop jump
+    */
+   label = x86_get_label(p->func);
+   {
+      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
+      int last_variant = -1;
+      struct x86_reg vb;
+
+      for (j = 0; j < p->translate.key.nr_elements; j++) {
+         const struct translate_element *a = &p->translate.key.element[j];
+         unsigned variant = p->element_to_buffer_variant[j];
+
+         /* Figure out source pointer address:
+          */
+         if (variant != last_variant) {
+            last_variant = variant;
+            vb = get_buffer_ptr(p, index_size, variant, elt);
+         }
+
+         if (!translate_attr( p, a,
+                              x86_make_disp(vb, a->input_offset),
+                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
+            return FALSE;
+      }
+
+      /* Next output vertex:
+       */
+      x64_rexw(p->func);
+      x86_lea(p->func,
+              p->outbuf_EBX,
+              x86_make_disp(p->outbuf_EBX,
+                            p->translate.key.output_stride));
+
+      /* Incr index
+       */
+      incr_inputs( p, index_size );
+   }
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(p->func, p->count_EBP);
+   x86_jcc(p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func->need_emms)
+      mmx_emms(p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(p->func, fixup);
+
+   /* Pop regs and return
+    */
+   if (x86_target(p->func) != X86_64_STD_ABI) {
+      x86_pop(p->func, p->idx_ESI);
+      x86_pop(p->func, p->machine_EDI);
+   }
+
+   x86_pop(p->func, p->count_EBP);
+   x86_pop(p->func, p->outbuf_EBX);
+
+   if (x86_target(p->func) == X86_64_WIN64_ABI) {
+      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+   }
+   x86_ret(p->func);
+
+   return TRUE;
+}
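/*
 * The generated function is easier to follow as the loop it encodes.
 * A rough C rendering of the emitted control flow; get_buffer_ptr_c()
 * and translate_attr_c() are hypothetical stand-ins for the code
 * produced by get_buffer_ptr() and translate_attr().
 */
static boolean
run_c(struct translate_sse *p, unsigned index_size,
      const char *elts, unsigned count, char *out)
{
   if (count == 0)
      return TRUE;                      /* the cc_E forward jump */

   do {                                 /* label ... dec/jcc(cc_NZ) */
      int last_variant = -1;
      const char *vb = NULL;
      unsigned j;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Re-derive the source pointer only when the variant changes. */
         if ((int)variant != last_variant) {
            last_variant = (int)variant;
            vb = get_buffer_ptr_c(p, index_size, variant, elts);
         }

         translate_attr_c(p, a, vb + a->input_offset,
                          out + a->output_offset);
      }

      out += p->translate.key.output_stride;   /* next output vertex */
      elts += index_size;                      /* incr_inputs() */
   } while (--count);

   return TRUE;
}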
+
+
+static void translate_sse_set_buffer( struct translate *translate,
+                                      unsigned buf,
+                                      const void *ptr,
+                                      unsigned stride,
+                                      unsigned max_index )
+{
+   struct translate_sse *p = (struct translate_sse *)translate;
+
+   if (buf < p->nr_buffers) {
+      p->buffer[buf].base_ptr = (char *)ptr;
+      p->buffer[buf].stride = stride;
+      p->buffer[buf].max_index = max_index;
+   }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n",
+                       __FUNCTION__, buf,
+                       p->nr_buffers,
+                       ptr, stride);
+}
+
+
+static void translate_sse_release( struct translate *translate )
+{
+   struct translate_sse *p = (struct translate_sse *)translate;
+
+   x86_release_func( &p->elt8_func );
+   x86_release_func( &p->elt16_func );
+   x86_release_func( &p->elt_func );
+   x86_release_func( &p->linear_func );
+
+   os_free_aligned(p);
+}
+
+
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   struct translate_sse *p = NULL;
+   unsigned i;
+
+   /* this check is misnamed: it actually tests whether rtasm is
+    * enabled, not just whether SSE is present */
+   if (!rtasm_cpu_has_sse())
+      goto fail;
+
+   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
+   if (p == NULL)
+      goto fail;
+   memset(p, 0, sizeof(*p));
+   memcpy(p->consts, consts, sizeof(consts));
+
+   p->translate.key = *key;
+   p->translate.release = translate_sse_release;
+   p->translate.set_buffer = translate_sse_set_buffer;
+
+   for (i = 0; i < key->nr_elements; i++) {
+      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+         unsigned j;
+
+         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+         if (key->element[i].instance_divisor) {
+            p->use_instancing = TRUE;
+         }
+
+         /*
+          * Map the vertex element to a vertex buffer variant, reusing
+          * an existing (buffer, divisor) pair when one matches.
+          */
+         for (j = 0; j < p->nr_buffer_variants; j++) {
+            if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
+                p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
+               break;
+            }
+         }
+         if (j == p->nr_buffer_variants) {
+            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
+            p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
+            p->nr_buffer_variants++;
+         }
+         p->element_to_buffer_variant[i] = j;
+      } else {
+         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
+      }
+   }
+
+   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
+
+   if (!build_vertex_emit(p, &p->linear_func, 0))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt_func, 4))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt16_func, 2))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt8_func, 1))
+      goto fail;
+
+   p->translate.run = (run_func) x86_get_func(&p->linear_func);
+   if (p->translate.run == NULL)
+      goto fail;
+
+   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
+   if (p->translate.run_elts == NULL)
+      goto fail;
+
+   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
+   if (p->translate.run_elts16 == NULL)
+      goto fail;
+
+   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
+   if (p->translate.run_elts8 == NULL)
+      goto fail;
+
+   return &p->translate;
+
+ fail:
+   if (p)
+      translate_sse_release( &p->translate );
+
+   return NULL;
+}
+
+
+#else
+
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   return NULL;
+}
+
+#endif
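/*
 * Callers drive the object returned above through the translate vtable.
 * A hypothetical usage sketch: verts, vertex_stride, nr_verts and out
 * are illustrative, and the element's input/output formats are elided
 * since they depend on the fetch/emit support compiled in.
 */
static void
convert_vertices(const void *verts, unsigned vertex_stride,
                 unsigned nr_verts, void *out)
{
   struct translate_key key;
   struct translate *t;

   memset(&key, 0, sizeof(key));
   key.output_stride = 16;
   key.nr_elements = 1;
   key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
   key.element[0].input_buffer = 0;
   key.element[0].input_offset = 0;
   key.element[0].instance_divisor = 0;
   key.element[0].output_offset = 0;
   /* key.element[0].input_format / output_format elided */

   t = translate_sse2_create(&key);
   if (!t)
      return;   /* no rtasm/SSE: fall back to a generic translate path */

   t->set_buffer(t, 0, verts, vertex_stride, nr_verts - 1);
   t->run(t, 0 /* start */, nr_verts /* count */,
          0 /* start_instance */, 0 /* instance_id */, out);
   t->release(t);
}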
diff --git a/drivers/video/Gallium/include/pipe/p_config.h b/drivers/video/Gallium/include/pipe/p_config.h
index 6b51160af6..60f3205fc2 100644
--- a/drivers/video/Gallium/include/pipe/p_config.h
+++ b/drivers/video/Gallium/include/pipe/p_config.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,24 +22,24 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
 **************************************************************************/

 /**
  * @file
  * Gallium configuration defines.
- * 
- * This header file sets several defines based on the compiler, processor 
- * architecture, and operating system being used. These defines should be used 
- * throughout the code to facilitate porting to new platforms. It is likely that 
- * this file is auto-generated by an autoconf-like tool at some point, as some 
- * things cannot be determined by pre-defined environment alone. 
- * 
+ *
+ * This header file sets several defines based on the compiler, processor
+ * architecture, and operating system being used. These defines should be used
+ * throughout the code to facilitate porting to new platforms. It is likely that
+ * this file is auto-generated by an autoconf-like tool at some point, as some
+ * things cannot be determined by pre-defined environment alone.
+ *
  * See also:
  * - http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
  * - echo | gcc -dM -E - | sort
  * - http://msdn.microsoft.com/en-us/library/b0084kay.aspx
- * 
+ *
  * @author José Fonseca
  */
@@ -61,8 +61,8 @@
  * - 1400: Visual C++ 2005
  * - 1310: Visual C++ .NET 2003
  * - 1300: Visual C++ .NET 2002
- * 
- * __MSC__ seems to be an old macro -- it is not pre-defined on recent MSVC 
+ *
+ * __MSC__ seems to be an old macro -- it is not pre-defined on recent MSVC
  * versions.
  */
 #if defined(_MSC_VER) || defined(__MSC__)
@@ -162,7 +162,7 @@

 /*
  * Auto-detect the operating system family.
- * 
+ *
  * See subsystem below for a more fine-grained distinction.
  */

@@ -212,10 +212,6 @@
 #define PIPE_OS_UNIX
 #endif

-#if defined(_WIN32) || defined(WIN32)
-#define PIPE_OS_WINDOWS
-#endif
-
 #if defined(__HAIKU__)
 #define PIPE_OS_HAIKU
 #define PIPE_OS_UNIX
@@ -228,7 +224,7 @@

 /*
  * Try to auto-detect the subsystem.
- * 
+ *
  * NOTE: There is no way to auto-detect most of these.
  */
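/*
 * The detection macros above are meant to be consumed, not re-derived,
 * by platform-specific code.  A hypothetical consumer, assuming this
 * tree no longer defines PIPE_OS_WINDOWS (the block removed above):
 */
#include "pipe/p_config.h"

#if defined(PIPE_OS_HAIKU)
/* Haiku also defines PIPE_OS_UNIX, so test the more specific macro first. */
#include <OS.h>
#elif defined(PIPE_OS_UNIX)
#include <unistd.h>
#else
#error "unsupported platform"
#endif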