#include "drmP.h" #include "drm.h" #include "i915_drm.h" #include "i915_drv.h" #include "intel_drv.h" //#include #undef mb #undef rmb #undef wmb #define mb() asm volatile("mfence") #define rmb() asm volatile ("lfence") #define wmb() asm volatile ("sfence") typedef struct { struct drm_i915_gem_object *batch; struct list_head objects; u32 exec_start; u32 exec_len; }batchbuffer_t; struct change_domains { uint32_t invalidate_domains; uint32_t flush_domains; uint32_t flush_rings; uint32_t flips; }; /* * Set the next domain for the specified object. This * may not actually perform the necessary flushing/invaliding though, * as that may want to be batched with other set_domain operations * * This is (we hope) the only really tricky part of gem. The goal * is fairly simple -- track which caches hold bits of the object * and make sure they remain coherent. A few concrete examples may * help to explain how it works. For shorthand, we use the notation * (read_domains, write_domain), e.g. (CPU, CPU) to indicate the * a pair of read and write domain masks. * * Case 1: the batch buffer * * 1. Allocated * 2. Written by CPU * 3. Mapped to GTT * 4. Read by GPU * 5. Unmapped from GTT * 6. Freed * * Let's take these a step at a time * * 1. Allocated * Pages allocated from the kernel may still have * cache contents, so we set them to (CPU, CPU) always. * 2. Written by CPU (using pwrite) * The pwrite function calls set_domain (CPU, CPU) and * this function does nothing (as nothing changes) * 3. Mapped by GTT * This function asserts that the object is not * currently in any GPU-based read or write domains * 4. Read by GPU * i915_gem_execbuffer calls set_domain (COMMAND, 0). * As write_domain is zero, this function adds in the * current read domains (CPU+COMMAND, 0). * flush_domains is set to CPU. * invalidate_domains is set to COMMAND * clflush is run to get data out of the CPU caches * then i915_dev_set_domain calls i915_gem_flush to * emit an MI_FLUSH and drm_agp_chipset_flush * 5. Unmapped from GTT * i915_gem_object_unbind calls set_domain (CPU, CPU) * flush_domains and invalidate_domains end up both zero * so no flushing/invalidating happens * 6. Freed * yay, done * * Case 2: The shared render buffer * * 1. Allocated * 2. Mapped to GTT * 3. Read/written by GPU * 4. set_domain to (CPU,CPU) * 5. Read/written by CPU * 6. Read/written by GPU * * 1. Allocated * Same as last example, (CPU, CPU) * 2. Mapped to GTT * Nothing changes (assertions find that it is not in the GPU) * 3. Read/written by GPU * execbuffer calls set_domain (RENDER, RENDER) * flush_domains gets CPU * invalidate_domains gets GPU * clflush (obj) * MI_FLUSH and drm_agp_chipset_flush * 4. set_domain (CPU, CPU) * flush_domains gets GPU * invalidate_domains gets CPU * wait_rendering (obj) to make sure all drawing is complete. * This will include an MI_FLUSH to get the data from GPU * to memory * clflush (obj) to invalidate the CPU cache * Another MI_FLUSH in i915_gem_flush (eliminate this somehow?) * 5. Read/written by CPU * cache lines are loaded and dirtied * 6. Read written by GPU * Same as last GPU access * * Case 3: The constant buffer * * 1. Allocated * 2. Written by CPU * 3. Read by GPU * 4. Updated (written) by CPU again * 5. Read by GPU * * 1. Allocated * (CPU, CPU) * 2. Written by CPU * (CPU, CPU) * 3. Read by GPU * (CPU+RENDER, 0) * flush_domains = CPU * invalidate_domains = RENDER * clflush (obj) * MI_FLUSH * drm_agp_chipset_flush * 4. 
 *	4. Updated (written) by CPU again
 *		(CPU, CPU)
 *		flush_domains = 0 (no previous write domain)
 *		invalidate_domains = 0 (no new read domains)
 *	5. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 */
static void i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
					      struct intel_ring_buffer *ring,
					      struct change_domains *cd)
{
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/*
	 * If the object isn't moving to a new write domain,
	 * let the object stay in multiple read domains
	 */
	if (obj->base.pending_write_domain == 0)
		obj->base.pending_read_domains |= obj->base.read_domains;

	/*
	 * Flush the current write domain if
	 * the new read domains don't match. Invalidate
	 * any read domains which differ from the old
	 * write domain
	 */
	if (obj->base.write_domain &&
	    ((obj->base.write_domain != obj->base.pending_read_domains ||
	      obj->ring != ring) ||
	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
		flush_domains |= obj->base.write_domain;
		invalidate_domains |=
			obj->base.pending_read_domains & ~obj->base.write_domain;
	}
	/*
	 * Invalidate any read caches which may have
	 * stale data. That is, any new read domains.
	 */
	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
		i915_gem_clflush_object(obj);

	if (obj->base.pending_write_domain)
		cd->flips |= atomic_read(&obj->pending_flip);

	/* The actual obj->write_domain will be updated with
	 * pending_write_domain after we emit the accumulated flush for all
	 * of our domain changes in execbuffers (which clears objects'
	 * write_domains).  So if we have a current write domain that we
	 * aren't changing, set pending_write_domain to that.
	 */
	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
		obj->base.pending_write_domain = obj->base.write_domain;

	cd->invalidate_domains |= invalidate_domains;
	cd->flush_domains |= flush_domains;
	if (flush_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= obj->ring->id;
	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= ring->id;
}

static int i915_gem_execbuffer_flush(struct drm_device *dev,
				     uint32_t invalidate_domains,
				     uint32_t flush_domains,
				     uint32_t flush_rings)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	int i, ret;

	if (flush_domains & I915_GEM_DOMAIN_CPU)
		intel_gtt_chipset_flush();

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
		for (i = 0; i < I915_NUM_RINGS; i++)
			if (flush_rings & (1 << i)) {
				ret = i915_gem_flush_ring(&dev_priv->ring[i],
							  invalidate_domains,
							  flush_domains);
				if (ret)
					return ret;
			}
	}

	return 0;
}

static int i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
					   struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	struct change_domains cd;
	int ret;

	memset(&cd, 0, sizeof(cd));
	list_for_each_entry(obj, objects, exec_list)
		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);

	if (cd.invalidate_domains | cd.flush_domains) {
		ret = i915_gem_execbuffer_flush(ring->dev,
						cd.invalidate_domains,
						cd.flush_domains,
						cd.flush_rings);
		if (ret)
			return ret;
	}

	/* The pending-flip wait and inter-ring synchronisation done by the
	 * stock execbuffer path are left disabled here.
	 */
//	if (cd.flips) {
//		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
//		if (ret)
//			return ret;
//	}
//
//	list_for_each_entry(obj, objects, exec_list) {
//		ret = i915_gem_execbuffer_sync_rings(obj, ring);
//		if (ret)
//			return ret;
//	}

	return 0;
}
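/*
 * Illustrative sketch only (added for exposition, never called from this
 * file): the mask arithmetic that i915_gem_object_set_to_gpu_domain()
 * performs for Case 1 of the comment block above -- a batch buffer written
 * by the CPU and about to be read by the command streamer.  The local
 * variable names are hypothetical; only the I915_GEM_DOMAIN_* masks come
 * from i915_drm.h.
 */
static inline void example_case1_domain_masks(void)
{
	uint32_t read_domains  = I915_GEM_DOMAIN_CPU;     /* currently cached by the CPU */
	uint32_t write_domain  = I915_GEM_DOMAIN_CPU;     /* last writer was the CPU */
	uint32_t pending_read  = I915_GEM_DOMAIN_COMMAND; /* execbuffer wants to read it */
	uint32_t pending_write = 0;                       /* no new writer */
	uint32_t flush_domains = 0, invalidate_domains = 0;

	/* No new write domain, so the object keeps its old read domains too. */
	if (pending_write == 0)
		pending_read |= read_domains;             /* (CPU+COMMAND, 0) */

	/* The old write domain (CPU) differs from the new readers: flush it. */
	if (write_domain && write_domain != pending_read)
		flush_domains |= write_domain;            /* flush_domains = CPU (clflush) */

	/* Any brand new read domain must be invalidated before use. */
	invalidate_domains |= pending_read & ~read_domains; /* invalidate_domains = COMMAND */

	/*
	 * i915_gem_execbuffer_flush() then turns these bits into a chipset
	 * flush for the CPU domain and an MI_FLUSH for the GPU domains.
	 */
	(void)flush_domains;
	(void)invalidate_domains;
}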
static void i915_gem_execbuffer_move_to_active(struct list_head *objects,
					       struct intel_ring_buffer *ring,
					       u32 seqno)
{
	struct drm_i915_gem_object *obj;

	list_for_each_entry(obj, objects, exec_list) {
		u32 old_read = obj->base.read_domains;
		u32 old_write = obj->base.write_domain;

		obj->base.read_domains = obj->base.pending_read_domains;
		obj->base.write_domain = obj->base.pending_write_domain;
		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;

		i915_gem_object_move_to_active(obj, ring, seqno);
		if (obj->base.write_domain) {
			obj->dirty = 1;
			obj->pending_gpu_write = true;
			list_move_tail(&obj->gpu_write_list,
				       &ring->gpu_write_list);
//			intel_mark_busy(ring->dev, obj);
		}

		/* old_read/old_write are only consumed by the disabled tracepoint. */
//		trace_i915_gem_object_change_domain(obj, old_read, old_write);
	}
}

static void i915_gem_execbuffer_retire_commands(struct drm_device *dev,
						struct intel_ring_buffer *ring)
{
	struct drm_i915_gem_request *request;
	u32 invalidate;

	/*
	 * Ensure that the commands in the batch buffer are
	 * finished before the interrupt fires.
	 *
	 * The sampler always gets flushed on i965 (sigh).
	 */
	invalidate = I915_GEM_DOMAIN_COMMAND;
	if (INTEL_INFO(dev)->gen >= 4)
		invalidate |= I915_GEM_DOMAIN_SAMPLER;
	if (ring->flush(ring, invalidate, 0)) {
		i915_gem_next_request_seqno(ring);
		return;
	}

	/* Add a breadcrumb for the completion of the batch buffer */
	request = kzalloc(sizeof(*request), GFP_KERNEL);
	if (request == NULL || i915_add_request(ring, NULL, request)) {
		i915_gem_next_request_seqno(ring);
		kfree(request);
	}
}

int exec_batch(struct drm_device *dev, struct intel_ring_buffer *ring,
	       batchbuffer_t *exec)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *obj;
	u32 seqno;
	int i;	/* only used by the disabled semaphore-wrap check below */
	int ret;

	/* The batch is always submitted on the render ring. */
	ring = &dev_priv->ring[RCS];

	mutex_lock(&dev->struct_mutex);

	list_for_each_entry(obj, &exec->objects, exec_list) {
		obj->base.pending_read_domains = 0;
		obj->base.pending_write_domain = 0;
	}

	exec->batch->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;

	ret = i915_gem_execbuffer_move_to_gpu(ring, &exec->objects);
	if (ret)
		goto err;

	seqno = i915_gem_next_request_seqno(ring);
//	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++) {
//		if (seqno < ring->sync_seqno[i]) {
			/* The GPU can not handle its semaphore value wrapping,
			 * so every billion or so execbuffers, we need to stall
			 * the GPU in order to reset the counters.
			 */
//			ret = i915_gpu_idle(dev);
//			if (ret)
//				goto err;
//			BUG_ON(ring->sync_seqno[i]);
//		}
//	}

	ret = ring->dispatch_execbuffer(ring, exec->exec_start, exec->exec_len);
	if (ret)
		goto err;

	i915_gem_execbuffer_move_to_active(&exec->objects, ring, seqno);
	i915_gem_execbuffer_retire_commands(dev, ring);

err:
	mutex_unlock(&dev->struct_mutex);
	return ret;
}
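/*
 * Hedged usage sketch (exposition only, not part of the original module): one
 * way a caller might drive exec_batch().  It assumes the caller has already
 * pinned @batch_obj into the GTT and filled it with valid commands; the helper
 * name and its parameters are hypothetical.
 */
static inline int example_submit_batch(struct drm_device *dev,
				       struct drm_i915_gem_object *batch_obj,
				       u32 batch_len)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	batchbuffer_t exec;

	INIT_LIST_HEAD(&exec.objects);
	/* Every object the batch touches must be on the list; here just the batch. */
	list_add_tail(&batch_obj->exec_list, &exec.objects);

	exec.batch = batch_obj;
	exec.exec_start = batch_obj->gtt_offset;	/* GTT address of the first command */
	exec.exec_len = batch_len;			/* length of the batch in bytes */

	/* exec_batch() currently forces the render ring regardless of this argument. */
	return exec_batch(dev, &dev_priv->ring[RCS], &exec);
}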