/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Xiang Haihao <haihao.xiang@intel.com>
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#ifndef HAVE_GEN_AVC_SURFACE
#define HAVE_GEN_AVC_SURFACE 1
#endif

#include "intel_batchbuffer.h"
#include "intel_driver.h"

#include "i965_defines.h"
#include "i965_drv_video.h"
#include "i965_avc_bsd.h"
#include "i965_media_h264.h"
#include "i965_media.h"
#include "i965_decoder_utils.h"

static void
i965_avc_bsd_init_avc_bsd_surface(VADriverContextP ctx, 
                                  struct object_surface *obj_surface,
                                  VAPictureParameterBufferH264 *pic_param,
                                  struct i965_h264_context *i965_h264_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    GenAvcSurface *avc_bsd_surface = obj_surface->private_data;

    obj_surface->free_private_data = gen_free_avc_surface;

    if (!avc_bsd_surface) {
        avc_bsd_surface = calloc(sizeof(GenAvcSurface), 1);
        assert((obj_surface->size & 0x3f) == 0);
        obj_surface->private_data = avc_bsd_surface;
    }

    avc_bsd_surface->dmv_bottom_flag = (pic_param->pic_fields.bits.field_pic_flag &&
                                        !pic_param->seq_fields.bits.direct_8x8_inference_flag);

    if (avc_bsd_surface->dmv_top == NULL) {
        avc_bsd_surface->dmv_top = dri_bo_alloc(i965->intel.bufmgr,
                                                "direct mv w/r buffer",
                                                DMV_SIZE,
                                                0x1000);
    }

    if (avc_bsd_surface->dmv_bottom_flag &&
        avc_bsd_surface->dmv_bottom == NULL) {
        avc_bsd_surface->dmv_bottom = dri_bo_alloc(i965->intel.bufmgr,
                                                   "direct mv w/r buffer",
                                                   DMV_SIZE,
                                                   0x1000);
    }
}

static void
i965_bsd_ind_obj_base_address(VADriverContextP ctx,
                              struct decode_state *decode_state,
                              int slice,
                              struct i965_h264_context *i965_h264_context)
                              
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;

    dri_bo *ind_bo = decode_state->slice_datas[slice]->bo;

    BEGIN_BCS_BATCH(batch, 3);
    OUT_BCS_BATCH(batch, CMD_BSD_IND_OBJ_BASE_ADDR | (3 - 2));
    OUT_BCS_RELOC(batch, ind_bo,
                  I915_GEM_DOMAIN_INSTRUCTION, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);
}

static void
i965_avc_bsd_img_state(VADriverContextP ctx,
                       struct decode_state *decode_state,
                       struct i965_h264_context *i965_h264_context)
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    int qm_present_flag;
    int img_struct;
    int mbaff_frame_flag;
    unsigned int avc_it_command_header;
    unsigned int width_in_mbs, height_in_mbs;
    VAPictureParameterBufferH264 *pic_param;

    if (decode_state->iq_matrix && decode_state->iq_matrix->buffer)
        qm_present_flag = 1;
    else
        qm_present_flag = 0; /* built-in QM matrices */

    assert(decode_state->pic_param && decode_state->pic_param->buffer);
    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;

    assert(!(pic_param->CurrPic.flags & VA_PICTURE_H264_INVALID));

    if (pic_param->CurrPic.flags & VA_PICTURE_H264_TOP_FIELD)
        img_struct = 1;
    else if (pic_param->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
        img_struct = 3;
    else
        img_struct = 0;

    if ((img_struct & 0x1) == 0x1) {
        assert(pic_param->pic_fields.bits.field_pic_flag == 0x1);
    } else {
        assert(pic_param->pic_fields.bits.field_pic_flag == 0x0);
    }

    if (pic_param->seq_fields.bits.frame_mbs_only_flag) { /* a frame containing only frame macroblocks */
        assert(pic_param->seq_fields.bits.mb_adaptive_frame_field_flag == 0);
        assert(pic_param->pic_fields.bits.field_pic_flag == 0);
    } else {
        assert(pic_param->seq_fields.bits.direct_8x8_inference_flag == 1); /* see H.264 spec */
    }

    mbaff_frame_flag = (pic_param->seq_fields.bits.mb_adaptive_frame_field_flag &&
                        !pic_param->pic_fields.bits.field_pic_flag);

    width_in_mbs = ((pic_param->picture_width_in_mbs_minus1 + 1) & 0xff);
    height_in_mbs = ((pic_param->picture_height_in_mbs_minus1 + 1) & 0xff); /* frame height */
                                                                               
    assert(!((width_in_mbs * height_in_mbs) & 0x8000)); /* hardware requirement */

    /* BSD unit doesn't support 4:2:2 and 4:4:4 picture */
    assert(pic_param->seq_fields.bits.chroma_format_idc == 0 || /* monochrome picture */
           pic_param->seq_fields.bits.chroma_format_idc == 1);  /* 4:2:0 */
    assert(pic_param->seq_fields.bits.residual_colour_transform_flag == 0); /* only available for 4:4:4 */

    avc_it_command_header = (CMD_MEDIA_OBJECT_EX | (12 - 2));

    BEGIN_BCS_BATCH(batch, 6);
    OUT_BCS_BATCH(batch, CMD_AVC_BSD_IMG_STATE | (6 - 2));
    OUT_BCS_BATCH(batch, 
                  ((width_in_mbs * height_in_mbs) & 0x7fff));
    OUT_BCS_BATCH(batch, 
                  (height_in_mbs << 16) | 
                  (width_in_mbs << 0));
    OUT_BCS_BATCH(batch, 
                  ((pic_param->second_chroma_qp_index_offset & 0x1f) << 24) |
                  ((pic_param->chroma_qp_index_offset & 0x1f) << 16) | 
                  (SCAN_RASTER_ORDER << 15) | /* AVC ILDB Data */
                  (SCAN_SPECIAL_ORDER << 14) | /* AVC IT Command */
                  (SCAN_RASTER_ORDER << 13) | /* AVC IT Data */
                  (1 << 12) | /* always 1, hardware requirement */
                  (qm_present_flag << 10) |
                  (img_struct << 8) |
                  (16 << 0)); /* FIXME: always support 16 reference frames ??? */
    OUT_BCS_BATCH(batch,
                  (RESIDUAL_DATA_OFFSET << 24) | /* residual data offset */
                  (0 << 17) | /* don't overwrite SRT */
                  (0 << 16) | /* Un-SRT (Unsynchronized Root Thread) */
                  (0 << 12) | /* FIXME: no 16MV ??? */
                  (pic_param->seq_fields.bits.chroma_format_idc << 10) |
                  (i965_h264_context->enable_avc_ildb << 8)  | /* Enable ILDB writing output */
                  (pic_param->pic_fields.bits.entropy_coding_mode_flag << 7) |
                  ((!pic_param->pic_fields.bits.reference_pic_flag) << 6) |
                  (pic_param->pic_fields.bits.constrained_intra_pred_flag << 5) |
                  (pic_param->seq_fields.bits.direct_8x8_inference_flag << 4) |
                  (pic_param->pic_fields.bits.transform_8x8_mode_flag << 3) |
                  (pic_param->seq_fields.bits.frame_mbs_only_flag << 2) |
                  (mbaff_frame_flag << 1) |
                  (pic_param->pic_fields.bits.field_pic_flag << 0));
    OUT_BCS_BATCH(batch, avc_it_command_header);
    ADVANCE_BCS_BATCH(batch);
}

static void
i965_avc_bsd_qm_state(VADriverContextP ctx,
                      struct decode_state *decode_state,
                      struct i965_h264_context *i965_h264_context)
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    int cmd_len;
    VAIQMatrixBufferH264 *iq_matrix;
    VAPictureParameterBufferH264 *pic_param;

    if (!decode_state->iq_matrix || !decode_state->iq_matrix->buffer)
        return;

    iq_matrix = (VAIQMatrixBufferH264 *)decode_state->iq_matrix->buffer;

    assert(decode_state->pic_param && decode_state->pic_param->buffer);
    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;

    cmd_len = 2 + 6 * 4; /* always load six 4x4 scaling matrices */

    if (pic_param->pic_fields.bits.transform_8x8_mode_flag)
        cmd_len += 2 * 16; /* load two 8x8 scaling matrices */

    BEGIN_BCS_BATCH(batch, cmd_len);
    OUT_BCS_BATCH(batch, CMD_AVC_BSD_QM_STATE | (cmd_len - 2));

    if (pic_param->pic_fields.bits.transform_8x8_mode_flag)
        OUT_BCS_BATCH(batch, 
                      (0x0  << 8) | /* don't use default built-in matrices */
                      (0xff << 0)); /* six 4x4 and two 8x8 scaling matrices */
    else
        OUT_BCS_BATCH(batch, 
                      (0x0  << 8) | /* don't use default built-in matrices */
                      (0x3f << 0)); /* six 4x4 scaling matrices */

    intel_batchbuffer_data(batch, &iq_matrix->ScalingList4x4[0][0], 6 * 4 * 4);

    if (pic_param->pic_fields.bits.transform_8x8_mode_flag)
        intel_batchbuffer_data(batch, &iq_matrix->ScalingList8x8[0][0], 2 * 16 * 4);

    ADVANCE_BCS_BATCH(batch);
}

static void
i965_avc_bsd_slice_state(VADriverContextP ctx, 
                         VAPictureParameterBufferH264 *pic_param, 
                         VASliceParameterBufferH264 *slice_param,
                         struct i965_h264_context *i965_h264_context)
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    int present_flag, cmd_len, list, j;
    uint8_t ref_idx_state[32];
    char weightoffsets[32 * 6];

    /* don't issue SLICE_STATE for intra-prediction decoding */
    if (slice_param->slice_type == SLICE_TYPE_I ||
        slice_param->slice_type == SLICE_TYPE_SI)
        return;

    cmd_len = 2;

    if (slice_param->slice_type == SLICE_TYPE_P ||
        slice_param->slice_type == SLICE_TYPE_SP) {
        present_flag = PRESENT_REF_LIST0;
        cmd_len += 8;
    } else { 
        present_flag = PRESENT_REF_LIST0 | PRESENT_REF_LIST1;
        cmd_len += 16;
    }

    if ((slice_param->slice_type == SLICE_TYPE_P ||
         slice_param->slice_type == SLICE_TYPE_SP) && 
        (pic_param->pic_fields.bits.weighted_pred_flag == 1)) {
        present_flag |= PRESENT_WEIGHT_OFFSET_L0;
        cmd_len += 48;
    }

    if ((slice_param->slice_type == SLICE_TYPE_B) &&
        (pic_param->pic_fields.bits.weighted_bipred_idc == 1)) {
        present_flag |= PRESENT_WEIGHT_OFFSET_L0 | PRESENT_WEIGHT_OFFSET_L1;
        cmd_len += 96;
    }

    BEGIN_BCS_BATCH(batch, cmd_len);
    OUT_BCS_BATCH(batch, CMD_AVC_BSD_SLICE_STATE | (cmd_len - 2));
    OUT_BCS_BATCH(batch, present_flag);

    for (list = 0; list < 2; list++) {
        int flag, num_va_pics;
        VAPictureH264 *va_pic;

        if (list == 0) {
            flag        = PRESENT_REF_LIST0;
            va_pic      = slice_param->RefPicList0;
            num_va_pics = slice_param->num_ref_idx_l0_active_minus1 + 1;
        } else {
            flag        = PRESENT_REF_LIST1;
            va_pic      = slice_param->RefPicList1;
            num_va_pics = slice_param->num_ref_idx_l1_active_minus1 + 1;
        }

        if (!(present_flag & flag))
            continue;

        gen5_fill_avc_ref_idx_state(
            ref_idx_state,
            va_pic, num_va_pics,
            i965_h264_context->fsid_list
        );            
        intel_batchbuffer_data(batch, ref_idx_state, sizeof(ref_idx_state));
    }

    i965_h264_context->weight128_luma_l0 = 0;
    i965_h264_context->weight128_luma_l1 = 0;
    i965_h264_context->weight128_chroma_l0 = 0;
    i965_h264_context->weight128_chroma_l1 = 0;

    i965_h264_context->weight128_offset0_flag = 0;
    i965_h264_context->weight128_offset0 = 0;

    if (present_flag & PRESENT_WEIGHT_OFFSET_L0) {
        for (j = 0; j < 32; j++) {
            weightoffsets[j * 6 + 0] = slice_param->luma_offset_l0[j];
            weightoffsets[j * 6 + 1] = slice_param->luma_weight_l0[j];
            weightoffsets[j * 6 + 2] = slice_param->chroma_offset_l0[j][0];
            weightoffsets[j * 6 + 3] = slice_param->chroma_weight_l0[j][0];
            weightoffsets[j * 6 + 4] = slice_param->chroma_offset_l0[j][1];
            weightoffsets[j * 6 + 5] = slice_param->chroma_weight_l0[j][1];

            if (pic_param->pic_fields.bits.weighted_pred_flag == 1 ||
                pic_param->pic_fields.bits.weighted_bipred_idc == 1) {
                if (i965_h264_context->use_hw_w128) {
                    if (slice_param->luma_weight_l0[j] == 128)
                        i965_h264_context->weight128_luma_l0 |= (1 << j);

                    if (slice_param->chroma_weight_l0[j][0] == 128 ||
                        slice_param->chroma_weight_l0[j][1] == 128)
                        i965_h264_context->weight128_chroma_l0 |= (1 << j);
                } else {
                    /* FIXME: workaround for weight 128 */
                    if (slice_param->luma_weight_l0[j] == 128 ||
                        slice_param->chroma_weight_l0[j][0] == 128 ||
                        slice_param->chroma_weight_l0[j][1] == 128)
                        i965_h264_context->weight128_offset0_flag = 1;
                }
            }
        }

        intel_batchbuffer_data(batch, weightoffsets, sizeof(weightoffsets));
    }

    if (present_flag & PRESENT_WEIGHT_OFFSET_L1) {
        for (j = 0; j < 32; j++) {
            weightoffsets[j * 6 + 0] = slice_param->luma_offset_l1[j];
            weightoffsets[j * 6 + 1] = slice_param->luma_weight_l1[j];
            weightoffsets[j * 6 + 2] = slice_param->chroma_offset_l1[j][0];
            weightoffsets[j * 6 + 3] = slice_param->chroma_weight_l1[j][0];
            weightoffsets[j * 6 + 4] = slice_param->chroma_offset_l1[j][1];
            weightoffsets[j * 6 + 5] = slice_param->chroma_weight_l1[j][1];

            if (pic_param->pic_fields.bits.weighted_bipred_idc == 1) {
                if (i965_h264_context->use_hw_w128) {
                    if (slice_param->luma_weight_l1[j] == 128)
                        i965_h264_context->weight128_luma_l1 |= (1 << j);

                    if (slice_param->chroma_weight_l1[j][0] == 128 ||
                        slice_param->chroma_weight_l1[j][1] == 128)
                        i965_h264_context->weight128_chroma_l1 |= (1 << j);
                } else {
                    if (slice_param->luma_weight_l0[j] == 128 ||
                        slice_param->chroma_weight_l0[j][0] == 128 ||
                        slice_param->chroma_weight_l0[j][1] == 128)
                        i965_h264_context->weight128_offset0_flag = 1;
                }
            }
        }

        intel_batchbuffer_data(batch, weightoffsets, sizeof(weightoffsets));
    }

    ADVANCE_BCS_BATCH(batch);
}

static void
i965_avc_bsd_buf_base_state(VADriverContextP ctx,
                            VAPictureParameterBufferH264 *pic_param, 
                            VASliceParameterBufferH264 *slice_param,
                            struct i965_h264_context *i965_h264_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    struct i965_avc_bsd_context *i965_avc_bsd_context;
    int i, j;
    VAPictureH264 *va_pic;
    struct object_surface *obj_surface;
    GenAvcSurface *avc_bsd_surface;

    i965_avc_bsd_context = &i965_h264_context->i965_avc_bsd_context;

    BEGIN_BCS_BATCH(batch, 74);
    OUT_BCS_BATCH(batch, CMD_AVC_BSD_BUF_BASE_STATE | (74 - 2));
    OUT_BCS_RELOC(batch, i965_avc_bsd_context->bsd_raw_store.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_RELOC(batch, i965_avc_bsd_context->mpr_row_store.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_RELOC(batch, i965_h264_context->avc_it_command_mb_info.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  i965_h264_context->avc_it_command_mb_info.mbs * i965_h264_context->use_avc_hw_scoreboard * MB_CMD_IN_BYTES);
    OUT_BCS_RELOC(batch, i965_h264_context->avc_it_data.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  (i965_h264_context->avc_it_data.write_offset << 6));

    if (i965_h264_context->enable_avc_ildb)
        OUT_BCS_RELOC(batch, i965_h264_context->avc_ildb_data.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_BATCH(batch, 0);

    for (i = 0; i < ARRAY_ELEMS(i965_h264_context->fsid_list); i++) {
        if (i965_h264_context->fsid_list[i].surface_id != VA_INVALID_ID) {
            int found = 0;
            for (j = 0; j < ARRAY_ELEMS(pic_param->ReferenceFrames); j++) {
                va_pic = &pic_param->ReferenceFrames[j];
                
                if (va_pic->flags & VA_PICTURE_H264_INVALID)
                    continue;

                if (va_pic->picture_id == i965_h264_context->fsid_list[i].surface_id) {
                    found = 1;
                    break;
                }
            }

            assert(found == 1);

            if (!(va_pic->flags & VA_PICTURE_H264_INVALID)) {
                obj_surface = SURFACE(va_pic->picture_id);
                assert(obj_surface);
                avc_bsd_surface = obj_surface->private_data;
            
                if (avc_bsd_surface == NULL) {
                    OUT_BCS_BATCH(batch, 0);
                    OUT_BCS_BATCH(batch, 0);
                } else {
                    OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_top,
                                  I915_GEM_DOMAIN_INSTRUCTION, 0,
                                  0);

                    if (avc_bsd_surface->dmv_bottom_flag == 1)
                        OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_bottom,
                                      I915_GEM_DOMAIN_INSTRUCTION, 0,
                                      0);
                    else
                        OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_top,
                                      I915_GEM_DOMAIN_INSTRUCTION, 0,
                                      0);
                }
            } 
        } else {
            OUT_BCS_BATCH(batch, 0);
            OUT_BCS_BATCH(batch, 0);
        }
    }

    va_pic = &pic_param->CurrPic;
    assert(!(va_pic->flags & VA_PICTURE_H264_INVALID));
    obj_surface = SURFACE(va_pic->picture_id);
    assert(obj_surface);
    obj_surface->flags &= ~SURFACE_REF_DIS_MASK;
    obj_surface->flags |= (pic_param->pic_fields.bits.reference_pic_flag ? SURFACE_REFERENCED : 0);
    i965_check_alloc_surface_bo(ctx, obj_surface, 0, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);

    /* initial uv component for YUV400 case */
    if (pic_param->seq_fields.bits.chroma_format_idc == 0) {
         unsigned int uv_offset = obj_surface->width * obj_surface->height; 
         unsigned int uv_size   = obj_surface->width * obj_surface->height / 2; 

         dri_bo_map(obj_surface->bo, 1);
         memset(obj_surface->bo->virtual + uv_offset, 0x80, uv_size);
         dri_bo_unmap(obj_surface->bo);
    }

    i965_avc_bsd_init_avc_bsd_surface(ctx, obj_surface, pic_param, i965_h264_context);
    avc_bsd_surface = obj_surface->private_data;

    OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_top,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);

    if (avc_bsd_surface->dmv_bottom_flag == 1)
        OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_bottom,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_RELOC(batch, avc_bsd_surface->dmv_top,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);

    /* POC List */
    for (i = 0; i < ARRAY_ELEMS(i965_h264_context->fsid_list); i++) {
        if (i965_h264_context->fsid_list[i].surface_id != VA_INVALID_ID) {
            int found = 0;
            for (j = 0; j < ARRAY_ELEMS(pic_param->ReferenceFrames); j++) {
                va_pic = &pic_param->ReferenceFrames[j];
                
                if (va_pic->flags & VA_PICTURE_H264_INVALID)
                    continue;

                if (va_pic->picture_id == i965_h264_context->fsid_list[i].surface_id) {
                    found = 1;
                    break;
                }
            }

            assert(found == 1);

            if (!(va_pic->flags & VA_PICTURE_H264_INVALID)) {
                OUT_BCS_BATCH(batch, va_pic->TopFieldOrderCnt);
                OUT_BCS_BATCH(batch, va_pic->BottomFieldOrderCnt);
            } 
        } else {
            OUT_BCS_BATCH(batch, 0);
            OUT_BCS_BATCH(batch, 0);
        }
    }

    va_pic = &pic_param->CurrPic;
    OUT_BCS_BATCH(batch, va_pic->TopFieldOrderCnt);
    OUT_BCS_BATCH(batch, va_pic->BottomFieldOrderCnt);

    ADVANCE_BCS_BATCH(batch);
}

static void
g4x_avc_bsd_object(VADriverContextP ctx, 
                   struct decode_state *decode_state,
                   VAPictureParameterBufferH264 *pic_param,
                   VASliceParameterBufferH264 *slice_param,
                   int slice_index,
                   struct i965_h264_context *i965_h264_context)
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    int width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
    int height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1; /* frame height */

    if (slice_param) {
        int encrypted, counter_value, cmd_len;
        int slice_hor_pos, slice_ver_pos;
        int num_ref_idx_l0, num_ref_idx_l1;
        int mbaff_picture = (!pic_param->pic_fields.bits.field_pic_flag &&
                             pic_param->seq_fields.bits.mb_adaptive_frame_field_flag);
        unsigned int slice_data_bit_offset;
        int weighted_pred_idc = 0;
        int first_mb_in_slice = 0;
        int slice_type;

        encrypted = 0; /* FIXME: which flag in VAAPI is used for encryption? */

        if (encrypted) {
            cmd_len = 9;
            counter_value = 0; /* FIXME: ??? */
        } else 
            cmd_len = 8;


        slice_data_bit_offset = avc_get_first_mb_bit_offset_with_epb(
            decode_state->slice_datas[slice_index]->bo,
            slice_param,
            pic_param->pic_fields.bits.entropy_coding_mode_flag
        );

        if (slice_param->slice_type == SLICE_TYPE_I ||
            slice_param->slice_type == SLICE_TYPE_SI)
            slice_type = SLICE_TYPE_I;
        else if (slice_param->slice_type == SLICE_TYPE_P ||
                 slice_param->slice_type == SLICE_TYPE_SP)
            slice_type = SLICE_TYPE_P;
        else {
            assert(slice_param->slice_type == SLICE_TYPE_B);
            slice_type = SLICE_TYPE_B;
        }

        if (slice_type == SLICE_TYPE_I) {
            assert(slice_param->num_ref_idx_l0_active_minus1 == 0);
            assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
            num_ref_idx_l0 = 0;
            num_ref_idx_l1 = 0;
        } else if (slice_type == SLICE_TYPE_P) {
            assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
            num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
            num_ref_idx_l1 = 0;
        } else {
            num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
            num_ref_idx_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
        }

        if (slice_type == SLICE_TYPE_P)
            weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
        else if (slice_type == SLICE_TYPE_B)
            weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;

        first_mb_in_slice = slice_param->first_mb_in_slice << mbaff_picture;
        slice_hor_pos = first_mb_in_slice % width_in_mbs; 
        slice_ver_pos = first_mb_in_slice / width_in_mbs;

        BEGIN_BCS_BATCH(batch, cmd_len);
        OUT_BCS_BATCH(batch, CMD_AVC_BSD_OBJECT | (cmd_len - 2));
        OUT_BCS_BATCH(batch, 
                      (encrypted << 31) |
                      ((slice_param->slice_data_size - (slice_data_bit_offset >> 3)) << 0));
        OUT_BCS_BATCH(batch, 
                      (slice_param->slice_data_offset +
                       (slice_data_bit_offset >> 3)));
        OUT_BCS_BATCH(batch, 
                      (0 << 31) | /* concealment mode: 0->intra 16x16 prediction, 1->inter P Copy */
                      (0 << 14) | /* ignore BSDPrematureComplete Error handling */
                      (0 << 13) | /* FIXME: ??? */
                      (0 << 12) | /* ignore MPR Error handling */
                      (0 << 10) | /* ignore Entropy Error handling */
                      (0 << 8)  | /* ignore MB Header Error handling */
                      (slice_type << 0));
        OUT_BCS_BATCH(batch, 
                      (num_ref_idx_l1 << 24) |
                      (num_ref_idx_l0 << 16) |
                      (slice_param->chroma_log2_weight_denom << 8) |
                      (slice_param->luma_log2_weight_denom << 0));
        OUT_BCS_BATCH(batch, 
                      (weighted_pred_idc << 30) |
                      (slice_param->direct_spatial_mv_pred_flag << 29) |
                      (slice_param->disable_deblocking_filter_idc << 27) |
                      (slice_param->cabac_init_idc << 24) |
                      ((pic_param->pic_init_qp_minus26 + 26 + slice_param->slice_qp_delta) << 16) |
                      ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
                      ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
        OUT_BCS_BATCH(batch, 
                      (slice_ver_pos << 24) |
                      (slice_hor_pos << 16) | 
                      (first_mb_in_slice << 0));
        OUT_BCS_BATCH(batch, 
                      (1 << 7) |
                      ((0x7 - (slice_data_bit_offset & 0x7)) << 0));

        if (encrypted) {
            OUT_BCS_BATCH(batch, counter_value);
        }

        ADVANCE_BCS_BATCH(batch); 
    } else {
        BEGIN_BCS_BATCH(batch, 8); 
        OUT_BCS_BATCH(batch, CMD_AVC_BSD_OBJECT | (8 - 2));
        OUT_BCS_BATCH(batch, 0); /* indirect data length for phantom slice is 0 */
        OUT_BCS_BATCH(batch, 0); /* indirect data start address for phantom slice is 0 */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, width_in_mbs * height_in_mbs / (1 + !!pic_param->pic_fields.bits.field_pic_flag));
        OUT_BCS_BATCH(batch, 0);
        ADVANCE_BCS_BATCH(batch);
    }
}

static void
ironlake_avc_bsd_object(VADriverContextP ctx, 
                        struct decode_state *decode_state,
                        VAPictureParameterBufferH264 *pic_param,
                        VASliceParameterBufferH264 *slice_param,
                        int slice_index,
                        struct i965_h264_context *i965_h264_context)
{
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    int width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
    int height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1; /* frame height */

    if (slice_param) {
        int encrypted, counter_value;
        int slice_hor_pos, slice_ver_pos;
        int num_ref_idx_l0, num_ref_idx_l1;
        int mbaff_picture = (!pic_param->pic_fields.bits.field_pic_flag &&
                             pic_param->seq_fields.bits.mb_adaptive_frame_field_flag);
        unsigned int slice_data_bit_offset;
        int weighted_pred_idc = 0;
        int first_mb_in_slice;
        int slice_type;

        encrypted = 0; /* FIXME: which flag in VAAPI is used for encryption? */

        if (encrypted) {
            counter_value = 0; /* FIXME: ??? */
        } else 
            counter_value = 0;

        slice_data_bit_offset = avc_get_first_mb_bit_offset_with_epb(
            decode_state->slice_datas[slice_index]->bo,
            slice_param,
            pic_param->pic_fields.bits.entropy_coding_mode_flag
        );

        if (slice_param->slice_type == SLICE_TYPE_I ||
            slice_param->slice_type == SLICE_TYPE_SI)
            slice_type = SLICE_TYPE_I;
        else if (slice_param->slice_type == SLICE_TYPE_P ||
                 slice_param->slice_type == SLICE_TYPE_SP)
            slice_type = SLICE_TYPE_P;
        else {
            assert(slice_param->slice_type == SLICE_TYPE_B);
            slice_type = SLICE_TYPE_B;
        }

        if (slice_type == SLICE_TYPE_I) {
            assert(slice_param->num_ref_idx_l0_active_minus1 == 0);
            assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
            num_ref_idx_l0 = 0;
            num_ref_idx_l1 = 0;
        } else if (slice_type == SLICE_TYPE_P) {
            assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
            num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
            num_ref_idx_l1 = 0;
        } else {
            num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
            num_ref_idx_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
        }

        if (slice_type == SLICE_TYPE_P)
            weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
        else if (slice_type == SLICE_TYPE_B)
            weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;

        first_mb_in_slice = slice_param->first_mb_in_slice << mbaff_picture;
        slice_hor_pos = first_mb_in_slice % width_in_mbs; 
        slice_ver_pos = first_mb_in_slice / width_in_mbs;

        BEGIN_BCS_BATCH(batch, 16);
        OUT_BCS_BATCH(batch, CMD_AVC_BSD_OBJECT | (16 - 2));
        OUT_BCS_BATCH(batch, 
                      (encrypted << 31) |
                      (0 << 30) | /* FIXME: packet based bit stream */
                      (0 << 29) | /* FIXME: packet format */
                      ((slice_param->slice_data_size - (slice_data_bit_offset >> 3)) << 0));
        OUT_BCS_BATCH(batch, 
                      (slice_param->slice_data_offset +
                       (slice_data_bit_offset >> 3)));
        OUT_BCS_BATCH(batch, 
                      (0 << 31) | /* concealment mode: 0->intra 16x16 prediction, 1->inter P Copy */
                      (0 << 14) | /* ignore BSDPrematureComplete Error handling */
                      (0 << 13) | /* FIXME: ??? */
                      (0 << 12) | /* ignore MPR Error handling */
                      (0 << 10) | /* ignore Entropy Error handling */
                      (0 << 8)  | /* ignore MB Header Error handling */
                      (slice_type << 0));
        OUT_BCS_BATCH(batch, 
                      (num_ref_idx_l1 << 24) |
                      (num_ref_idx_l0 << 16) |
                      (slice_param->chroma_log2_weight_denom << 8) |
                      (slice_param->luma_log2_weight_denom << 0));
        OUT_BCS_BATCH(batch, 
                      (weighted_pred_idc << 30) |
                      (slice_param->direct_spatial_mv_pred_flag << 29) |
                      (slice_param->disable_deblocking_filter_idc << 27) |
                      (slice_param->cabac_init_idc << 24) |
                      ((pic_param->pic_init_qp_minus26 + 26 + slice_param->slice_qp_delta) << 16) |
                      ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
                      ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
        OUT_BCS_BATCH(batch, 
                      (slice_ver_pos << 24) |
                      (slice_hor_pos << 16) | 
                      (first_mb_in_slice << 0));
        OUT_BCS_BATCH(batch, 
                      (1 << 7) |
                      ((0x7 - (slice_data_bit_offset & 0x7)) << 0));
        OUT_BCS_BATCH(batch, counter_value);
        
        /* FIXME: dw9-dw11 */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, i965_h264_context->weight128_luma_l0);
        OUT_BCS_BATCH(batch, i965_h264_context->weight128_luma_l1);
        OUT_BCS_BATCH(batch, i965_h264_context->weight128_chroma_l0);
        OUT_BCS_BATCH(batch, i965_h264_context->weight128_chroma_l1);

        ADVANCE_BCS_BATCH(batch); 
    } else {
        BEGIN_BCS_BATCH(batch, 16);
        OUT_BCS_BATCH(batch, CMD_AVC_BSD_OBJECT | (16 - 2));
        OUT_BCS_BATCH(batch, 0); /* indirect data length for phantom slice is 0 */
        OUT_BCS_BATCH(batch, 0); /* indirect data start address for phantom slice is 0 */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, width_in_mbs * height_in_mbs / (1 + !!pic_param->pic_fields.bits.field_pic_flag));
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        ADVANCE_BCS_BATCH(batch);
    }
}

static void
i965_avc_bsd_object(VADriverContextP ctx, 
                    struct decode_state *decode_state,
                    VAPictureParameterBufferH264 *pic_param,
                    VASliceParameterBufferH264 *slice_param,
                    int slice_index,
                    struct i965_h264_context *i965_h264_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);

    if (IS_IRONLAKE(i965->intel.device_id))
        ironlake_avc_bsd_object(ctx, decode_state, pic_param, slice_param, slice_index, i965_h264_context);
    else
        g4x_avc_bsd_object(ctx, decode_state, pic_param, slice_param, slice_index, i965_h264_context);
}

static void
i965_avc_bsd_phantom_slice(VADriverContextP ctx, 
                           struct decode_state *decode_state,
                           VAPictureParameterBufferH264 *pic_param,
                           struct i965_h264_context *i965_h264_context)
{
    i965_avc_bsd_object(ctx, decode_state, pic_param, NULL, 0, i965_h264_context);
}

static void
i965_avc_bsd_frame_store_index(VADriverContextP ctx,
                               VAPictureParameterBufferH264 *pic_param,
                               struct i965_h264_context *i965_h264_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    int i, j;

    assert(ARRAY_ELEMS(i965_h264_context->fsid_list) == ARRAY_ELEMS(pic_param->ReferenceFrames));

    for (i = 0; i < ARRAY_ELEMS(i965_h264_context->fsid_list); i++) {
        int found = 0;

        if (i965_h264_context->fsid_list[i].surface_id == VA_INVALID_ID)
            continue;

        for (j = 0; j < ARRAY_ELEMS(pic_param->ReferenceFrames); j++) {
            VAPictureH264 *ref_pic = &pic_param->ReferenceFrames[j];
            if (ref_pic->flags & VA_PICTURE_H264_INVALID)
                continue;

            if (i965_h264_context->fsid_list[i].surface_id == ref_pic->picture_id) {
                found = 1;
                break;
            }
        }

        if (!found) {
            struct object_surface *obj_surface = SURFACE(i965_h264_context->fsid_list[i].surface_id);
            obj_surface->flags &= ~SURFACE_REFERENCED;

            if ((obj_surface->flags & SURFACE_ALL_MASK) == SURFACE_DISPLAYED) {
                dri_bo_unreference(obj_surface->bo);
                obj_surface->bo = NULL;
                obj_surface->flags &= ~SURFACE_REF_DIS_MASK;
            }

            if (obj_surface->free_private_data)
                obj_surface->free_private_data(&obj_surface->private_data);

            i965_h264_context->fsid_list[i].surface_id = VA_INVALID_ID;
            i965_h264_context->fsid_list[i].frame_store_id = -1;
        }
    }

    for (i = 0; i < ARRAY_ELEMS(pic_param->ReferenceFrames); i++) {
        VAPictureH264 *ref_pic = &pic_param->ReferenceFrames[i];
        int found = 0;

        if (ref_pic->flags & VA_PICTURE_H264_INVALID)
            continue;

        for (j = 0; j < ARRAY_ELEMS(i965_h264_context->fsid_list); j++) {
            if (i965_h264_context->fsid_list[j].surface_id == VA_INVALID_ID)
                continue;
            
            if (i965_h264_context->fsid_list[j].surface_id == ref_pic->picture_id) {
                found = 1;
                break;
            }
        }

        if (!found) {
            int frame_idx;
            struct object_surface *obj_surface = SURFACE(ref_pic->picture_id);
            assert(obj_surface);
            i965_check_alloc_surface_bo(ctx, obj_surface, 0, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
            
            for (frame_idx = 0; frame_idx < ARRAY_ELEMS(i965_h264_context->fsid_list); frame_idx++) {
                for (j = 0; j < ARRAY_ELEMS(i965_h264_context->fsid_list); j++) {
                    if (i965_h264_context->fsid_list[j].surface_id == VA_INVALID_ID)
                        continue;

                    if (i965_h264_context->fsid_list[j].frame_store_id == frame_idx)
                        break;
                }

                if (j == ARRAY_ELEMS(i965_h264_context->fsid_list))
                    break;
            }

            assert(frame_idx < ARRAY_ELEMS(i965_h264_context->fsid_list));

            for (j = 0; j < ARRAY_ELEMS(i965_h264_context->fsid_list); j++) {
                if (i965_h264_context->fsid_list[j].surface_id == VA_INVALID_ID) {
                    i965_h264_context->fsid_list[j].surface_id = ref_pic->picture_id;
                    i965_h264_context->fsid_list[j].frame_store_id = frame_idx;
                    break;
                }
            }
        }
    }

    for (i = 0; i < ARRAY_ELEMS(i965_h264_context->fsid_list) - 1; i++) {
        if (i965_h264_context->fsid_list[i].surface_id != VA_INVALID_ID &&
            i965_h264_context->fsid_list[i].frame_store_id == i)
            continue;

        for (j = i + 1; j < ARRAY_ELEMS(i965_h264_context->fsid_list); j++) {
            if (i965_h264_context->fsid_list[j].surface_id != VA_INVALID_ID &&
                i965_h264_context->fsid_list[j].frame_store_id == i) {
                VASurfaceID id = i965_h264_context->fsid_list[i].surface_id;
                int frame_idx = i965_h264_context->fsid_list[i].frame_store_id;

                i965_h264_context->fsid_list[i].surface_id = i965_h264_context->fsid_list[j].surface_id;
                i965_h264_context->fsid_list[i].frame_store_id = i965_h264_context->fsid_list[j].frame_store_id;
                i965_h264_context->fsid_list[j].surface_id = id;
                i965_h264_context->fsid_list[j].frame_store_id = frame_idx;
                break;
            }
        }
    }
}

void 
i965_avc_bsd_pipeline(VADriverContextP ctx, struct decode_state *decode_state, void *h264_context)
{
    struct i965_h264_context *i965_h264_context = (struct i965_h264_context *)h264_context;
    struct intel_batchbuffer *batch = i965_h264_context->batch;
    VAPictureParameterBufferH264 *pic_param;
    VASliceParameterBufferH264 *slice_param;
    int i, j;

    assert(decode_state->pic_param && decode_state->pic_param->buffer);
    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
    i965_avc_bsd_frame_store_index(ctx, pic_param, i965_h264_context);

    i965_h264_context->enable_avc_ildb = 0;
    i965_h264_context->picture.i_flag = 1;

    for (j = 0; j < decode_state->num_slice_params && i965_h264_context->enable_avc_ildb == 0; j++) {
        assert(decode_state->slice_params && decode_state->slice_params[j]->buffer);
        slice_param = (VASliceParameterBufferH264 *)decode_state->slice_params[j]->buffer;

        for (i = 0; i < decode_state->slice_params[j]->num_elements; i++) {
            assert(slice_param->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
            assert((slice_param->slice_type == SLICE_TYPE_I) ||
                   (slice_param->slice_type == SLICE_TYPE_SI) ||
                   (slice_param->slice_type == SLICE_TYPE_P) ||
                   (slice_param->slice_type == SLICE_TYPE_SP) ||
                   (slice_param->slice_type == SLICE_TYPE_B));

            if (slice_param->disable_deblocking_filter_idc != 1) {
                i965_h264_context->enable_avc_ildb = 1;
                break;
            }

            slice_param++;
        }
    }

    intel_batchbuffer_start_atomic_bcs(batch, 0x1000);

    i965_avc_bsd_img_state(ctx, decode_state, i965_h264_context);
    i965_avc_bsd_qm_state(ctx, decode_state, i965_h264_context);

    for (j = 0; j < decode_state->num_slice_params; j++) {
        assert(decode_state->slice_params && decode_state->slice_params[j]->buffer);
        slice_param = (VASliceParameterBufferH264 *)decode_state->slice_params[j]->buffer;

        i965_bsd_ind_obj_base_address(ctx, decode_state, j, i965_h264_context);

        for (i = 0; i < decode_state->slice_params[j]->num_elements; i++) {
            assert(slice_param->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
            assert((slice_param->slice_type == SLICE_TYPE_I) ||
                   (slice_param->slice_type == SLICE_TYPE_SI) ||
                   (slice_param->slice_type == SLICE_TYPE_P) ||
                   (slice_param->slice_type == SLICE_TYPE_SP) ||
                   (slice_param->slice_type == SLICE_TYPE_B));

            if (i965_h264_context->picture.i_flag && 
                (slice_param->slice_type != SLICE_TYPE_I ||
                 slice_param->slice_type != SLICE_TYPE_SI))
                i965_h264_context->picture.i_flag = 0;

            i965_avc_bsd_slice_state(ctx, pic_param, slice_param, i965_h264_context);
            i965_avc_bsd_buf_base_state(ctx, pic_param, slice_param, i965_h264_context);
            i965_avc_bsd_object(ctx, decode_state, pic_param, slice_param, j, i965_h264_context);
            slice_param++;
        }
    }

    i965_avc_bsd_phantom_slice(ctx, decode_state, pic_param, i965_h264_context);
    intel_batchbuffer_emit_mi_flush(batch);
    intel_batchbuffer_end_atomic(batch);
    intel_batchbuffer_flush(batch);
}

void 
i965_avc_bsd_decode_init(VADriverContextP ctx, void *h264_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct i965_h264_context *i965_h264_context = (struct i965_h264_context *)h264_context;
    struct i965_avc_bsd_context *i965_avc_bsd_context;
    dri_bo *bo;

    assert(i965_h264_context);
    i965_avc_bsd_context = &i965_h264_context->i965_avc_bsd_context;

    dri_bo_unreference(i965_avc_bsd_context->bsd_raw_store.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "bsd raw store",
                      0x3000, /* at least 11520 bytes to support 120 MBs per row */
                      64);
    assert(bo);
    i965_avc_bsd_context->bsd_raw_store.bo = bo;

    dri_bo_unreference(i965_avc_bsd_context->mpr_row_store.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "mpr row store",
                      0x2000, /* at least 7680 bytes to support 120 MBs per row */
                      64);
    assert(bo);
    i965_avc_bsd_context->mpr_row_store.bo = bo;
}

Bool 
i965_avc_bsd_ternimate(struct i965_avc_bsd_context *i965_avc_bsd_context)
{
    dri_bo_unreference(i965_avc_bsd_context->bsd_raw_store.bo);
    dri_bo_unreference(i965_avc_bsd_context->mpr_row_store.bo);

    return True;
}