#include "drmP.h" #include "drm.h" #include "i915_drm.h" #include "i915_drv.h" #include "intel_drv.h" //#include #undef mb #undef rmb #undef wmb #define mb() asm volatile("mfence") #define rmb() asm volatile ("lfence") #define wmb() asm volatile ("sfence") typedef struct { struct drm_i915_gem_object *batch; struct list_head objects; u32 exec_start; u32 exec_len; }batchbuffer_t; struct change_domains { uint32_t invalidate_domains; uint32_t flush_domains; uint32_t flush_rings; uint32_t flips; }; /* * Set the next domain for the specified object. This * may not actually perform the necessary flushing/invaliding though, * as that may want to be batched with other set_domain operations * * This is (we hope) the only really tricky part of gem. The goal * is fairly simple -- track which caches hold bits of the object * and make sure they remain coherent. A few concrete examples may * help to explain how it works. For shorthand, we use the notation * (read_domains, write_domain), e.g. (CPU, CPU) to indicate the * a pair of read and write domain masks. * * Case 1: the batch buffer * * 1. Allocated * 2. Written by CPU * 3. Mapped to GTT * 4. Read by GPU * 5. Unmapped from GTT * 6. Freed * * Let's take these a step at a time * * 1. Allocated * Pages allocated from the kernel may still have * cache contents, so we set them to (CPU, CPU) always. * 2. Written by CPU (using pwrite) * The pwrite function calls set_domain (CPU, CPU) and * this function does nothing (as nothing changes) * 3. Mapped by GTT * This function asserts that the object is not * currently in any GPU-based read or write domains * 4. Read by GPU * i915_gem_execbuffer calls set_domain (COMMAND, 0). * As write_domain is zero, this function adds in the * current read domains (CPU+COMMAND, 0). * flush_domains is set to CPU. * invalidate_domains is set to COMMAND * clflush is run to get data out of the CPU caches * then i915_dev_set_domain calls i915_gem_flush to * emit an MI_FLUSH and drm_agp_chipset_flush * 5. Unmapped from GTT * i915_gem_object_unbind calls set_domain (CPU, CPU) * flush_domains and invalidate_domains end up both zero * so no flushing/invalidating happens * 6. Freed * yay, done * * Case 2: The shared render buffer * * 1. Allocated * 2. Mapped to GTT * 3. Read/written by GPU * 4. set_domain to (CPU,CPU) * 5. Read/written by CPU * 6. Read/written by GPU * * 1. Allocated * Same as last example, (CPU, CPU) * 2. Mapped to GTT * Nothing changes (assertions find that it is not in the GPU) * 3. Read/written by GPU * execbuffer calls set_domain (RENDER, RENDER) * flush_domains gets CPU * invalidate_domains gets GPU * clflush (obj) * MI_FLUSH and drm_agp_chipset_flush * 4. set_domain (CPU, CPU) * flush_domains gets GPU * invalidate_domains gets CPU * wait_rendering (obj) to make sure all drawing is complete. * This will include an MI_FLUSH to get the data from GPU * to memory * clflush (obj) to invalidate the CPU cache * Another MI_FLUSH in i915_gem_flush (eliminate this somehow?) * 5. Read/written by CPU * cache lines are loaded and dirtied * 6. Read written by GPU * Same as last GPU access * * Case 3: The constant buffer * * 1. Allocated * 2. Written by CPU * 3. Read by GPU * 4. Updated (written) by CPU again * 5. Read by GPU * * 1. Allocated * (CPU, CPU) * 2. Written by CPU * (CPU, CPU) * 3. Read by GPU * (CPU+RENDER, 0) * flush_domains = CPU * invalidate_domains = RENDER * clflush (obj) * MI_FLUSH * drm_agp_chipset_flush * 4. 
 *	4. Updated (written) by CPU again
 *		(CPU, CPU)
 *		flush_domains = 0 (no previous write domain)
 *		invalidate_domains = 0 (no new read domains)
 *	5. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 */
static void i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
					      struct intel_ring_buffer *ring,
					      struct change_domains *cd)
{
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/*
	 * If the object isn't moving to a new write domain,
	 * let the object stay in multiple read domains
	 */
	if (obj->base.pending_write_domain == 0)
		obj->base.pending_read_domains |= obj->base.read_domains;

	/*
	 * Flush the current write domain if
	 * the new read domains don't match. Invalidate
	 * any read domains which differ from the old
	 * write domain
	 */
	if (obj->base.write_domain &&
	    ((obj->base.write_domain != obj->base.pending_read_domains ||
	      obj->ring != ring) ||
	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
		flush_domains |= obj->base.write_domain;
		invalidate_domains |=
			obj->base.pending_read_domains & ~obj->base.write_domain;
	}
	/*
	 * Invalidate any read caches which may have
	 * stale data. That is, any new read domains.
	 */
	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
		i915_gem_clflush_object(obj);

	if (obj->base.pending_write_domain)
		cd->flips |= atomic_read(&obj->pending_flip);

	/* The actual obj->write_domain will be updated with
	 * pending_write_domain after we emit the accumulated flush for all
	 * of our domain changes in execbuffers (which clears objects'
	 * write_domains).  So if we have a current write domain that we
	 * aren't changing, set pending_write_domain to that.
	 */
	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
		obj->base.pending_write_domain = obj->base.write_domain;

	cd->invalidate_domains |= invalidate_domains;
	cd->flush_domains |= flush_domains;
	if (flush_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= obj->ring->id;
	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= ring->id;
}

static int i915_gem_execbuffer_flush(struct drm_device *dev,
				     uint32_t invalidate_domains,
				     uint32_t flush_domains,
				     uint32_t flush_rings)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	int i, ret;

	if (flush_domains & I915_GEM_DOMAIN_CPU)
		intel_gtt_chipset_flush();

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
		for (i = 0; i < I915_NUM_RINGS; i++)
			if (flush_rings & (1 << i)) {
				ret = i915_gem_flush_ring(&dev_priv->ring[i],
							  invalidate_domains,
							  flush_domains);
				if (ret)
					return ret;
			}
	}

	return 0;
}

static int i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
					   struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	struct change_domains cd;
	int ret;

	memset(&cd, 0, sizeof(cd));
	list_for_each_entry(obj, objects, exec_list)
		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);

	if (cd.invalidate_domains | cd.flush_domains) {
		ret = i915_gem_execbuffer_flush(ring->dev,
						cd.invalidate_domains,
						cd.flush_domains,
						cd.flush_rings);
		if (ret)
			return ret;
	}

	/* The pending-flip wait and inter-ring synchronisation done by the
	 * stock execbuffer path are left disabled here.
	 */
//	if (cd.flips) {
//		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
//		if (ret)
//			return ret;
//	}
//
//	list_for_each_entry(obj, objects, exec_list) {
//		ret = i915_gem_execbuffer_sync_rings(obj, ring);
//		if (ret)
//			return ret;
//	}

	return 0;
}
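/*
 * Illustrative sketch only (added for exposition, never called from this
 * file): the mask arithmetic that i915_gem_object_set_to_gpu_domain()
 * performs for Case 1 of the comment block above -- a batch buffer written
 * by the CPU and about to be read by the command streamer.  The local
 * variable names are hypothetical; only the I915_GEM_DOMAIN_* masks come
 * from i915_drm.h.
 */
static inline void example_case1_domain_masks(void)
{
	uint32_t read_domains  = I915_GEM_DOMAIN_CPU;     /* currently cached by the CPU */
	uint32_t write_domain  = I915_GEM_DOMAIN_CPU;     /* last writer was the CPU */
	uint32_t pending_read  = I915_GEM_DOMAIN_COMMAND; /* execbuffer wants to read it */
	uint32_t pending_write = 0;                       /* no new writer */
	uint32_t flush_domains = 0, invalidate_domains = 0;

	/* No new write domain, so the object keeps its old read domains too. */
	if (pending_write == 0)
		pending_read |= read_domains;             /* (CPU+COMMAND, 0) */

	/* The old write domain (CPU) differs from the new readers: flush it. */
	if (write_domain && write_domain != pending_read)
		flush_domains |= write_domain;            /* flush_domains = CPU (clflush) */

	/* Any brand new read domain must be invalidated before use. */
	invalidate_domains |= pending_read & ~read_domains; /* invalidate_domains = COMMAND */

	/*
	 * i915_gem_execbuffer_flush() then turns these bits into a chipset
	 * flush for the CPU domain and an MI_FLUSH for the GPU domains.
	 */
	(void)flush_domains;
	(void)invalidate_domains;
}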
static void i915_gem_execbuffer_move_to_active(struct list_head *objects,
					       struct intel_ring_buffer *ring,
					       u32 seqno)
{
	struct drm_i915_gem_object *obj;

	list_for_each_entry(obj, objects, exec_list) {
		u32 old_read = obj->base.read_domains;
		u32 old_write = obj->base.write_domain;

		obj->base.read_domains = obj->base.pending_read_domains;
		obj->base.write_domain = obj->base.pending_write_domain;
		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;

		i915_gem_object_move_to_active(obj, ring, seqno);
		if (obj->base.write_domain) {
			obj->dirty = 1;
			obj->pending_gpu_write = true;
			list_move_tail(&obj->gpu_write_list,
				       &ring->gpu_write_list);
//			intel_mark_busy(ring->dev, obj);
		}

		/* old_read/old_write are only consumed by the disabled tracepoint. */
//		trace_i915_gem_object_change_domain(obj, old_read, old_write);
	}
}

static void i915_gem_execbuffer_retire_commands(struct drm_device *dev,
						struct intel_ring_buffer *ring)
{
	struct drm_i915_gem_request *request;
	u32 invalidate;

	/*
	 * Ensure that the commands in the batch buffer are
	 * finished before the interrupt fires.
	 *
	 * The sampler always gets flushed on i965 (sigh).
	 */
	invalidate = I915_GEM_DOMAIN_COMMAND;
	if (INTEL_INFO(dev)->gen >= 4)
		invalidate |= I915_GEM_DOMAIN_SAMPLER;
	if (ring->flush(ring, invalidate, 0)) {
		i915_gem_next_request_seqno(ring);
		return;
	}

	/* Add a breadcrumb for the completion of the batch buffer */
	request = kzalloc(sizeof(*request), GFP_KERNEL);
	if (request == NULL || i915_add_request(ring, NULL, request)) {
		i915_gem_next_request_seqno(ring);
		kfree(request);
	}
}

int exec_batch(struct drm_device *dev, struct intel_ring_buffer *ring,
	       batchbuffer_t *exec)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *obj;
	u32 seqno;
	int i;	/* only used by the disabled semaphore-wrap check below */
	int ret;

	/* The batch is always submitted on the render ring. */
	ring = &dev_priv->ring[RCS];

	mutex_lock(&dev->struct_mutex);

	list_for_each_entry(obj, &exec->objects, exec_list) {
		obj->base.pending_read_domains = 0;
		obj->base.pending_write_domain = 0;
	}

	exec->batch->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;

	ret = i915_gem_execbuffer_move_to_gpu(ring, &exec->objects);
	if (ret)
		goto err;

	seqno = i915_gem_next_request_seqno(ring);
//	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++) {
//		if (seqno < ring->sync_seqno[i]) {
			/* The GPU can not handle its semaphore value wrapping,
			 * so every billion or so execbuffers, we need to stall
			 * the GPU in order to reset the counters.
			 */
//			ret = i915_gpu_idle(dev);
//			if (ret)
//				goto err;
//			BUG_ON(ring->sync_seqno[i]);
//		}
//	}

	ret = ring->dispatch_execbuffer(ring, exec->exec_start, exec->exec_len);
	if (ret)
		goto err;

	i915_gem_execbuffer_move_to_active(&exec->objects, ring, seqno);
	i915_gem_execbuffer_retire_commands(dev, ring);

err:
	mutex_unlock(&dev->struct_mutex);
	return ret;
}
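/*
 * Hedged usage sketch (exposition only, not part of the original module): one
 * way a caller might drive exec_batch().  It assumes the caller has already
 * pinned @batch_obj into the GTT and filled it with valid commands; the helper
 * name and its parameters are hypothetical.
 */
static inline int example_submit_batch(struct drm_device *dev,
				       struct drm_i915_gem_object *batch_obj,
				       u32 batch_len)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	batchbuffer_t exec;

	INIT_LIST_HEAD(&exec.objects);
	/* Every object the batch touches must be on the list; here just the batch. */
	list_add_tail(&batch_obj->exec_list, &exec.objects);

	exec.batch = batch_obj;
	exec.exec_start = batch_obj->gtt_offset;	/* GTT address of the first command */
	exec.exec_len = batch_len;			/* length of the batch in bytes */

	/* exec_batch() currently forces the render ring regardless of this argument. */
	return exec_batch(dev, &dev_priv->ring[RCS], &exec);
}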