/*
 * Copyright © 2010-2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Zhao Yakui <yakui.zhao@intel.com>
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>

#include "intel_batchbuffer.h"
#include "intel_driver.h"

#include "i965_defines.h"
#include "i965_drv_video.h"
#include "i965_encoder.h"
#include "gen6_vme.h"
#include "gen6_mfc.h"
/* Drop any SURFACE_STATE_PADDED_SIZE inherited from the headers above so the
 * Gen7 value defined below is the one used throughout this file. */
#ifdef SURFACE_STATE_PADDED_SIZE
#undef SURFACE_STATE_PADDED_SIZE
#endif

#define VME_MSG_LENGTH		32

#define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN7
/* Byte offset of surface state number `index`. The argument is parenthesised
 * so that expression arguments (e.g. `base + 1`) expand correctly. */
#define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * (index))
/* Byte offset of binding table entry `index`; the binding table starts right
 * after MAX_MEDIA_SURFACES_GEN6 surface states and holds one unsigned int per
 * entry. */
#define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * (index))

#define CURBE_ALLOCATION_SIZE   37              /* in 256-bit */
#define CURBE_TOTAL_DATA_LENGTH (4 * 32)        /* in byte, it should be less than or equal to CURBE_ALLOCATION_SIZE * 32 */
#define CURBE_URB_ENTRY_LENGTH  4               /* in 256-bit, it should be less than or equal to CURBE_TOTAL_DATA_LENGTH / 32 */

/* Video coding types; VIDEO_CODING_SUM is the number of entries. */
enum VIDEO_CODING_TYPE {
    VIDEO_CODING_AVC   = 0,
    VIDEO_CODING_MPEG2 = 1,
    VIDEO_CODING_SUM   = 2
};

/* Indices of the H.264 VME kernels; AVC_VME_KERNEL_SUM is the kernel count. */
enum AVC_VME_KERNEL_TYPE {
    AVC_VME_INTRA_SHADER  = 0,
    AVC_VME_INTER_SHADER  = 1,
    AVC_VME_BATCHBUFFER   = 2,
    AVC_VME_BINTER_SHADER = 3,
    AVC_VME_KERNEL_SUM    = 4
};

/* Indices of the MPEG-2 VME kernels; MPEG2_VME_KERNEL_SUM is the kernel count. */
enum MPEG2_VME_KERNEL_TYPE {
    MPEG2_VME_INTER_SHADER = 0,
    MPEG2_VME_BATCHBUFFER  = 1,
    MPEG2_VME_KERNEL_SUM   = 2
};
 

/* Kernel binary included at compile time from shaders/vme/intra_frame_ivb.g7b,
 * stored as rows of four 32-bit words. Used by the "AVC VME Intra Frame" entry
 * of gen7_vme_kernels. */
static const uint32_t gen7_vme_intra_frame[][4] = {
#include "shaders/vme/intra_frame_ivb.g7b"
};

/* Kernel binary included at compile time from shaders/vme/inter_frame_ivb.g7b,
 * stored as rows of four 32-bit words. Used by the "AVC VME inter Frame" entry
 * of gen7_vme_kernels. */
static const uint32_t gen7_vme_inter_frame[][4] = {
#include "shaders/vme/inter_frame_ivb.g7b"
};

/* Kernel binary included at compile time from shaders/vme/batchbuffer.g7b,
 * stored as rows of four 32-bit words. Used by the "AVC VME BATCHBUFFER" entry
 * of gen7_vme_kernels. */
static const uint32_t gen7_vme_batchbuffer[][4] = {
#include "shaders/vme/batchbuffer.g7b"
};

/* Kernel binary included at compile time from shaders/vme/inter_bframe_ivb.g7b,
 * stored as rows of four 32-bit words. Used by the "AVC VME binter Frame" entry
 * of gen7_vme_kernels. */
static const uint32_t gen7_vme_binter_frame[][4] = {
#include "shaders/vme/inter_bframe_ivb.g7b"
};

/*
 * H.264 VME kernel table. Entries are listed in AVC_VME_KERNEL_TYPE order and
 * each one gives, positionally: a human-readable name, the kernel index, the
 * kernel binary and its size in bytes, followed by a NULL pointer.
 * NOTE(review): the trailing NULL is presumably the kernel's buffer object,
 * filled in when the kernels are loaded -- confirm against struct i965_kernel.
 */
static struct i965_kernel gen7_vme_kernels[] = {
    {
        "AVC VME Intra Frame",
        AVC_VME_INTRA_SHADER,			/*index*/
        gen7_vme_intra_frame, 			
        sizeof(gen7_vme_intra_frame),		
        NULL
    },
    {
        "AVC VME inter Frame",
        AVC_VME_INTER_SHADER,
        gen7_vme_inter_frame,
        sizeof(gen7_vme_inter_frame),
        NULL
    },
    {
        "AVC VME BATCHBUFFER",
        AVC_VME_BATCHBUFFER,
        gen7_vme_batchbuffer,
        sizeof(gen7_vme_batchbuffer),
        NULL
    },
    {
        "AVC VME binter Frame",
        AVC_VME_BINTER_SHADER,
        gen7_vme_binter_frame,
        sizeof(gen7_vme_binter_frame),
        NULL
    }
};

/* Kernel binary included at compile time from shaders/vme/mpeg2_inter_ivb.g7b,
 * stored as rows of four 32-bit words. Used by the "MPEG2 VME inter Frame"
 * entry of gen7_vme_mpeg2_kernels. */
static const uint32_t gen7_vme_mpeg2_inter_frame[][4] = {
#include "shaders/vme/mpeg2_inter_ivb.g7b"
};

/* Kernel binary included at compile time from shaders/vme/batchbuffer.g7b (the
 * same file that gen7_vme_batchbuffer includes), stored as rows of four 32-bit
 * words. Used by the "MPEG2 VME BATCHBUFFER" entry of gen7_vme_mpeg2_kernels. */
static const uint32_t gen7_vme_mpeg2_batchbuffer[][4] = {
#include "shaders/vme/batchbuffer.g7b"
};

/*
 * MPEG-2 VME kernel table. Entries are listed in MPEG2_VME_KERNEL_TYPE order
 * and each one gives, positionally: a human-readable name, the kernel index,
 * the kernel binary and its size in bytes, followed by a NULL pointer.
 * NOTE(review): the trailing NULL is presumably the kernel's buffer object,
 * filled in when the kernels are loaded -- confirm against struct i965_kernel.
 */
static struct i965_kernel gen7_vme_mpeg2_kernels[] = {
    {
        "MPEG2 VME inter Frame",
        MPEG2_VME_INTER_SHADER,
        gen7_vme_mpeg2_inter_frame,
        sizeof(gen7_vme_mpeg2_inter_frame),
        NULL
    },
    {
        "MPEG2 VME BATCHBUFFER",
        MPEG2_VME_BATCHBUFFER,
        gen7_vme_mpeg2_batchbuffer,
        sizeof(gen7_vme_mpeg2_batchbuffer),
        NULL
    },
};

/*
 * Set up the surface state and binding table entry at slot `index` for
 * obj_surface by delegating to the context's vme_surface2_setup hook.
 * Only used for VME source surface state.
 */
static void
gen7_vme_source_surface_state(VADriverContextP ctx,
                              int index,
                              struct object_surface *obj_surface,
                              struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *const vme = encoder_context->vme_context;

    vme->vme_surface2_setup(ctx, &vme->gpe_context, obj_surface,
                            BINDING_TABLE_OFFSET(index),
                            SURFACE_STATE_OFFSET(index));
}

/*
 * Set up the surface state and binding table entry at slot `index` for
 * obj_surface by delegating to the context's vme_media_rw_surface_setup hook;
 * the hook's final argument is always passed as 0.
 */
static void
gen7_vme_media_source_surface_state(VADriverContextP ctx,
                                    int index,
                                    struct object_surface *obj_surface,
                                    struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *const vme = encoder_context->vme_context;

    vme->vme_media_rw_surface_setup(ctx, &vme->gpe_context, obj_surface,
                                    BINDING_TABLE_OFFSET(index),
                                    SURFACE_STATE_OFFSET(index),
                                    0);
}

/*
 * Allocate the H.264 VME output buffer and bind it at slot `index`.
 *
 * The buffer holds one block per macroblock of the picture; the block size is
 * INTRA_VME_OUTPUT_IN_BYTES when the first slice is an I slice and
 * INTER_VME_OUTPUT_IN_BYTES otherwise.
 */
static void
gen7_vme_output_buffer_setup(VADriverContextP ctx,
                             struct encode_state *encode_state,
                             int index,
                             struct intel_encoder_context *encoder_context)

{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme = encoder_context->vme_context;
    VAEncSequenceParameterBufferH264 *seq = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncSliceParameterBufferH264 *first_slice = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
    int mbs_wide = seq->picture_width_in_mbs;
    int mbs_high = seq->picture_height_in_mbs;

    vme->vme_output.pitch = 16; /* in bytes, always 16 */
    vme->vme_output.num_blocks = mbs_wide * mbs_high;
    vme->vme_output.size_block = (first_slice->slice_type == SLICE_TYPE_I) ?
                                 INTRA_VME_OUTPUT_IN_BYTES :
                                 INTER_VME_OUTPUT_IN_BYTES;

    vme->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
                                      "VME output buffer",
                                      vme->vme_output.num_blocks * vme->vme_output.size_block,
                                      0x1000);
    assert(vme->vme_output.bo);

    vme->vme_buffer_suface_setup(ctx,
                                 &vme->gpe_context,
                                 &vme->vme_output,
                                 BINDING_TABLE_OFFSET(index),
                                 SURFACE_STATE_OFFSET(index));
}

/*
 * Allocate the buffer that receives the per-macroblock VME commands for H.264
 * and bind it at slot `index`.
 *
 * One 64-byte block is reserved per macroblock of the picture, plus one extra
 * block. The allocation is asserted, matching gen7_vme_output_buffer_setup,
 * so a failed allocation is caught here instead of inside the surface setup
 * hook.
 */
static void
gen7_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                      struct encode_state *encode_state,
                                      int index,
                                      struct intel_encoder_context *encoder_context)

{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;

    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
    vme_context->vme_batchbuffer.pitch = 16;
    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
                                                   "VME batchbuffer",
                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
                                                   0x1000);
    assert(vme_context->vme_batchbuffer.bo);
    vme_context->vme_buffer_suface_setup(ctx,
                                         &vme_context->gpe_context,
                                         &vme_context->vme_batchbuffer,
                                         BINDING_TABLE_OFFSET(index),
                                         SURFACE_STATE_OFFSET(index));
}

/*
 * Set up every surface used by the H.264 VME kernels.
 *
 * Slots 0 and 4 both receive the input picture (through the two different
 * source-surface helpers), slot 3 the VME output buffer and slot 5 the VME
 * batchbuffer; the cost table is placed at INTEL_COST_TABLE_OFFSET. For
 * non-intra pictures the reference surfaces are set up as well.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen7_vme_surface_setup(VADriverContextP ctx, 
                       struct encode_state *encode_state,
                       int is_intra,
                       struct intel_encoder_context *encoder_context)
{
    /* current picture for encoding */
    struct object_surface *input = encode_state->input_yuv_object;

    gen7_vme_source_surface_state(ctx, 0, input, encoder_context);
    gen7_vme_media_source_surface_state(ctx, 4, input, encoder_context);

    if (!is_intra) {
        VAEncSliceParameterBufferH264 *first_slice = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
        int fixed_type = intel_avc_enc_slice_type_fixup(first_slice->slice_type);

        /* A caller passing is_intra == 0 must not hand us an intra slice. */
        assert(fixed_type != SLICE_TYPE_I && fixed_type != SLICE_TYPE_SI);

        /* NOTE(review): the (0, 1) / (1, 2) arguments are presumably
         * (reference list, surface slot) -- confirm against
         * intel_avc_vme_reference_state. */
        intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 0, 1, gen7_vme_source_surface_state);

        if (fixed_type == SLICE_TYPE_B)
            intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 1, 2, gen7_vme_source_surface_state);
    }

    /* VME output */
    gen7_vme_output_buffer_setup(ctx, encode_state, 3, encoder_context);
    gen7_vme_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
    intel_h264_setup_cost_surface(ctx, encode_state, encoder_context,
                                  BINDING_TABLE_OFFSET(INTEL_COST_TABLE_OFFSET),
                                  SURFACE_STATE_OFFSET(INTEL_COST_TABLE_OFFSET));

    return VA_STATUS_SUCCESS;
}

/*
 * Fill the interface descriptor table (idrt buffer object) with one 32-byte
 * descriptor per VME kernel.
 *
 * Each descriptor records the kernel start pointer, the VME state buffer as
 * sampler state, the binding table pointer and the CURBE read length, and a
 * relocation is emitted for both buffer addresses written into it.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_interface_setup(VADriverContextP ctx, 
                                         struct encode_state *encode_state,
                                         struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme = encoder_context->vme_context;
    dri_bo *idrt_bo = vme->gpe_context.idrt.bo;
    struct gen6_interface_descriptor_data *table;
    int k;

    dri_bo_map(idrt_bo, 1);
    assert(idrt_bo->virtual);
    table = idrt_bo->virtual;

    for (k = 0; k < vme->vme_kernel_sum; k++) {
        struct gen6_interface_descriptor_data *entry = &table[k];
        struct i965_kernel *kernel = &vme->gpe_context.kernels[k];

        assert(sizeof(*entry) == 32);

        /* Set up the descriptor table entry */
        memset(entry, 0, sizeof(*entry));
        entry->desc0.kernel_start_pointer = (kernel->bo->offset >> 6);
        entry->desc2.sampler_count = 1; /* FIXME: */
        entry->desc2.sampler_state_pointer = (vme->vme_state.bo->offset >> 5);
        entry->desc3.binding_table_entry_count = 1; /* FIXME: */
        entry->desc3.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
        entry->desc4.constant_urb_entry_read_offset = 0;
        entry->desc4.constant_urb_entry_read_length = CURBE_URB_ENTRY_LENGTH;

        /* kernel start */
        dri_bo_emit_reloc(idrt_bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0,
                          k * sizeof(*entry) + offsetof(struct gen6_interface_descriptor_data, desc0),
                          kernel->bo);

        /* Sampler State (VME state pointer).
         * NOTE(review): the (1 << 2) delta presumably preserves the
         * sampler_count = 1 field that shares this dword -- confirm against
         * the descriptor layout. */
        dri_bo_emit_reloc(idrt_bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          (1 << 2),
                          k * sizeof(*entry) + offsetof(struct gen6_interface_descriptor_data, desc2),
                          vme->vme_state.bo);
    }

    dri_bo_unmap(idrt_bo);

    return VA_STATUS_SUCCESS;
}

/*
 * Upload the 128-byte VME state message held in the context to the CURBE
 * (constant) buffer.
 *
 * Before the copy, the last dword (index 31) is set to an mv_num value that
 * depends on the codec: for H.264 it is 32, 16 (level >= 30) or 8
 * (level >= 31); for MPEG-2 it is 2; otherwise 32.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_constant_setup(VADriverContextP ctx, 
                                        struct encode_state *encode_state,
                                        struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme = encoder_context->vme_context;
    unsigned int *state_msg = (unsigned int *)vme->vme_state_message;
    dri_bo *curbe_bo = vme->gpe_context.curbe.bo;
    unsigned char *curbe_data;
    int mv_num = 32;

    if (encoder_context->codec == CODEC_H264) {
        if (vme->h264_level >= 31)
            mv_num = 8;
        else if (vme->h264_level >= 30)
            mv_num = 16;
    } else if (encoder_context->codec == CODEC_MPEG2) {
        mv_num = 2;
    }

    state_msg[31] = mv_num;

    dri_bo_map(curbe_bo, 1);
    assert(curbe_bo->virtual);
    curbe_data = curbe_bo->virtual;

    /* Pass the required constant info into the constant buffer */
    memcpy(curbe_data, (char *)vme->vme_state_message, 128);

    dri_bo_unmap(curbe_bo);

    return VA_STATUS_SUCCESS;
}


/*
 * Copy the mode/MV cost entries for one qp into LUT set `lut_set` (0..3) of a
 * VME state message.
 *
 * The cost table holds one 32-byte record per qp. Dwords 0, 1, 3 and 4 of the
 * record go to message dwords 16 + 4 * lut_set onwards; the low 16 bits of
 * dword 2 are packed two-per-dword into message dwords 14 and 15 (even sets
 * in the low half, odd sets in the high half).
 */
static void
gen7_vme_avc_fill_cost_lut(unsigned int *vme_state_message,
                           const uint8_t *cost_ptr,
                           int lut_set,
                           int qp)
{
    const unsigned int *mb_cost_table = (const unsigned int *)(cost_ptr + qp * 32);
    unsigned int *packed = &vme_state_message[14 + lut_set / 2];
    unsigned int *lut = &vme_state_message[16 + 4 * lut_set];
    unsigned int cost16 = mb_cost_table[2] & 0xFFFF;

    if (lut_set & 1)
        *packed |= cost16 << 16;
    else
        *packed = cost16;

    lut[0] = mb_cost_table[0];
    lut[1] = mb_cost_table[1];
    lut[2] = mb_cost_table[3];
    lut[3] = mb_cost_table[4];
}

/*
 * Build the eight H.264 VME state messages in the VME state buffer.
 *
 * The cost table (I, P or B) is chosen from the fixed-up type of the first
 * slice. Message i (32 dwords each) gets a search-path pattern in dwords
 * 0..13 and four cost LUT sets for qp = 8*i, 8*i+2, 8*i+4 and 8*i+6, each
 * clamped to 51. Every LUT set reads the cost record of its own qp.
 *
 * is_intra is accepted for interface compatibility and is not used.
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_avc_state_setup(VADriverContextP ctx,
                                         struct encode_state *encode_state,
                                         int is_intra,
                                         struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
    unsigned int is_low_quality = (encoder_context->quality_level == ENCODER_LOW_QUALITY);
    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
    unsigned int *vme_state_message;
    dri_bo *cost_bo;
    uint8_t *cost_ptr;
    int i, lut_set;

    if (slice_type == SLICE_TYPE_I) {
        cost_bo = vme_context->i_qp_cost_table;
    } else if (slice_type == SLICE_TYPE_P) {
        cost_bo = vme_context->p_qp_cost_table;
    } else {
        cost_bo = vme_context->b_qp_cost_table;
    }

    dri_bo_map(vme_context->vme_state.bo, 1);
    dri_bo_map(cost_bo, 0);
    assert(vme_context->vme_state.bo->virtual);
    assert(cost_bo->virtual);

    cost_ptr = (uint8_t *)cost_bo->virtual;

    /* up to 8 VME_SEARCH_PATH_LUT is supported */
    /* Two subsequent qp will share the same mode/motion-vector cost table */
    /* the range is from 0-51 */
    for (i = 0; i < 8; i++)  {
        vme_state_message = (unsigned int *)vme_context->vme_state.bo->virtual +
                             i * 32;

        if ((slice_type == SLICE_TYPE_P) && !is_low_quality) {
            vme_state_message[0] = 0x01010101;
            vme_state_message[1] = 0x10010101;
            vme_state_message[2] = 0x0F0F0F0F;
            vme_state_message[3] = 0x100F0F0F;
            vme_state_message[4] = 0x01010101;
            vme_state_message[5] = 0x10010101;
            vme_state_message[6] = 0x0F0F0F0F;
            vme_state_message[7] = 0x100F0F0F;
            vme_state_message[8] = 0x01010101;
            vme_state_message[9] = 0x10010101;
            vme_state_message[10] = 0x0F0F0F0F;
            vme_state_message[11] = 0x000F0F0F;
            vme_state_message[12] = 0x00;
            vme_state_message[13] = 0x00;
        } else {
            vme_state_message[0] = 0x10010101;
            vme_state_message[1] = 0x100F0F0F;
            vme_state_message[2] = 0x10010101;
            vme_state_message[3] = 0x000F0F0F;
            vme_state_message[4] = 0;
            vme_state_message[5] = 0;
            vme_state_message[6] = 0;
            vme_state_message[7] = 0;
            vme_state_message[8] = 0;
            vme_state_message[9] = 0;
            vme_state_message[10] = 0;
            vme_state_message[11] = 0;
            vme_state_message[12] = 0;
            vme_state_message[13] = 0;
        }

        /* Setup the four LUT sets for MbMV cost */
        for (lut_set = 0; lut_set < 4; lut_set++) {
            int qp = 8 * i + 2 * lut_set;

            /* when qp is greater than 51, use the cost_table of qp=51 to fulfill */
            if (qp > 51) {
                qp = 51;
            }

            gen7_vme_avc_fill_cost_lut(vme_state_message, cost_ptr, lut_set, qp);
        }
    }

    dri_bo_unmap(cost_bo);
    dri_bo_unmap( vme_context->vme_state.bo);
    return VA_STATUS_SUCCESS;
}

/*
 * Build the single MPEG-2 VME state message in the VME state buffer.
 *
 * Dwords 0..13 receive a fixed search-path pattern, dwords 14..19 are filled
 * from the cost values stored in the context's vme_state_message, and dwords
 * 20..31 are cleared.
 *
 * is_intra is not used. Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_mpeg2_state_setup(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           int is_intra,
                                           struct intel_encoder_context *encoder_context)
{
    static const unsigned int search_path[14] = {
        0x01010101, 0x10010101, 0x0F0F0F0F, 0x100F0F0F,
        0x01010101, 0x10010101, 0x0F0F0F0F, 0x100F0F0F,
        0x01010101, 0x10010101, 0x0F0F0F0F, 0x000F0F0F,
        0x00,       0x00
    };
    struct gen6_vme_context *vme = encoder_context->vme_context;
    unsigned int *costs = (unsigned int *)vme->vme_state_message;
    unsigned int *msg;
    int k;

    /* building VME state message */
    dri_bo_map(vme->vme_state.bo, 1);
    assert(vme->vme_state.bo->virtual);
    msg = (unsigned int *)vme->vme_state.bo->virtual;

    for (k = 0; k < 14; k++)
        msg[k] = search_path[k];

    msg[14] = (costs[2] & 0xFFFF);
    msg[15] = 0;
    msg[16] = costs[0];
    msg[17] = 0;
    msg[18] = costs[3];
    msg[19] = costs[4];

    for (k = 20; k < 32; k++)
        msg[k] = 0;

    dri_bo_unmap(vme->vme_state.bo);
    return VA_STATUS_SUCCESS;
}

/*
 * Write one MEDIA_OBJECT command per macroblock of every H.264 slice into the
 * VME batchbuffer, followed by MI_BATCH_BUFFER_END.
 *
 * Each command is 9 dwords: a 6-dword header carrying `kernel`, then three
 * inline dwords:
 *   - mb_width << 16 | mb_y << 8 | mb_x
 *   - quality_level << 24 | 1 << 16 | mb_intra_ub << 8 | transform_8x8_mode_flag
 *   - the macroblock qp (per-MB value when ROI is enabled, otherwise the
 *     picture-level qp: pic_init_qp + slice_qp_delta in CQP mode, the BRC
 *     value for the current layer and slice type in other modes).
 *
 * mb_intra_ub holds the INTRA_PRED_AVAIL_FLAG_* neighbour-availability bits,
 * restricted to neighbours that lie inside both the picture and the slice.
 *
 * The slice parameter buffers are walked as VAEncSliceParameterBufferH264, the
 * same type used for the first slice, so that stepping through a buffer with
 * several elements uses the H.264 element size. mb_height is not used.
 */
static void
gen7_vme_fill_vme_batchbuffer(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              int mb_width, int mb_height,
                              int kernel,
                              int transform_8x8_mode_flag,
                              struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *first_slice = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
    int slice_type = intel_avc_enc_slice_type_fixup(first_slice->slice_type);
    int mb_x = 0, mb_y = 0;
    int i, s, j;
    int qp, qp_mb, qp_index;
    unsigned int *command_ptr;

    if (encoder_context->rate_control_mode == VA_RC_CQP)
        qp = pic_param->pic_init_qp + first_slice->slice_qp_delta;
    else
        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];

    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
    assert(vme_context->vme_batchbuffer.bo->virtual);
    command_ptr = vme_context->vme_batchbuffer.bo->virtual;

    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
        VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;

        for (j = 0; j < encode_state->slice_params_ext[s]->num_elements; j++) {
            int slice_mb_begin = slice_param->macroblock_address;
            int slice_mb_number = slice_param->num_macroblocks;
            unsigned int mb_intra_ub;
            int slice_mb_x = slice_param->macroblock_address % mb_width;

            for (i = 0; i < slice_mb_number; i++) {
                int mb_count = i + slice_mb_begin;

                mb_x = mb_count % mb_width;
                mb_y = mb_count / mb_width;
                mb_intra_ub = 0;

                /* Neighbours available with respect to the picture edges */
                if (mb_x != 0) {
                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
                }

                if (mb_y != 0) {
                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;

                    if (mb_x != 0)
                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;

                    if (mb_x != (mb_width -1))
                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
                }

                /* Within the first mb_width macroblocks of the slice the row
                 * above belongs to another slice (or does not exist). */
                if (i < mb_width) {
                    if (i == 0)
                        mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);

                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);

                    if ((i == (mb_width - 1)) && slice_mb_x) {
                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
                    }
                }

                if ((i == mb_width) && slice_mb_x) {
                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
                }

                *command_ptr++ = (CMD_MEDIA_OBJECT | (9 - 2));
                *command_ptr++ = kernel;
                *command_ptr++ = 0;
                *command_ptr++ = 0;
                *command_ptr++ = 0;
                *command_ptr++ = 0;

                /*inline data */
                *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
                *command_ptr++ = ((encoder_context->quality_level << 24) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));

                if (vme_context->roi_enabled) {
                    qp_index = mb_y * mb_width + mb_x;
                    qp_mb = *(vme_context->qp_per_mb + qp_index);
                } else
                    qp_mb = qp;
                *command_ptr++ = qp_mb;
            }

            slice_param++;
        }
    }

    *command_ptr++ = 0;
    *command_ptr++ = MI_BATCH_BUFFER_END;

    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
}


/*
 * Reset the per-frame VME resources.
 *
 * Re-initialises the GPE context, drops the references to the previous VME
 * output buffer and VME batchbuffer (leaving both pointers NULL), and replaces
 * the VME state buffer with a freshly allocated 16 KiB one.
 */
static void gen7_vme_media_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme = encoder_context->vme_context;

    i965_gpe_context_init(ctx, &vme->gpe_context);

    /* VME output buffer */
    dri_bo_unreference(vme->vme_output.bo);
    vme->vme_output.bo = NULL;

    /* VME batchbuffer */
    dri_bo_unreference(vme->vme_batchbuffer.bo);
    vme->vme_batchbuffer.bo = NULL;

    /* VME state */
    dri_bo_unreference(vme->vme_state.bo);
    vme->vme_state.bo = dri_bo_alloc(i965->intel.bufmgr,
                                     "Buffer",
                                     1024*16, 64);
    assert(vme->vme_state.bo);
}

/*
 * Fill the VME batchbuffer for an H.264 picture and chain it from the main
 * batch with MI_BATCH_BUFFER_START.
 *
 * The walker-based fill is used unless the encoder runs in low-quality mode or
 * some slice does not start at the beginning of a macroblock row. The kernel
 * is chosen from the type of the first slice -- the same slice every other
 * setup step in this file looks at: intra for I/SI, inter for P/SP, and for
 * the remaining types the B-inter kernel (falling back to the inter kernel
 * when the walker path is not allowed).
 */
static void gen7_vme_pipeline_programing(VADriverContextP ctx, 
                                         struct encode_state *encode_state,
                                         struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
    int s;
    bool allow_hwscore = true;
    int kernel_shader;
    unsigned int is_low_quality = (encoder_context->quality_level == ENCODER_LOW_QUALITY);

    if (is_low_quality)
        allow_hwscore = false;
    else {
        for (s = 0; s < encode_state->num_slice_params_ext; s++) {
            /* Use a loop-local pointer so pSliceParameter keeps referring to
             * the first slice for the kernel selection below. */
            VAEncSliceParameterBufferH264 *slice = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;

            if ((slice->macroblock_address % width_in_mbs)) {
                allow_hwscore = false;
                break;
            }
        }
    }

    if ((pSliceParameter->slice_type == SLICE_TYPE_I) ||
        (pSliceParameter->slice_type == SLICE_TYPE_SI)) {
        kernel_shader = AVC_VME_INTRA_SHADER;
    } else if ((pSliceParameter->slice_type == SLICE_TYPE_P) ||
               (pSliceParameter->slice_type == SLICE_TYPE_SP)) {
        kernel_shader = AVC_VME_INTER_SHADER;
    } else {
        kernel_shader = AVC_VME_BINTER_SHADER;
        if (!allow_hwscore)
            kernel_shader = AVC_VME_INTER_SHADER;
    }

    if (allow_hwscore)
        gen7_vme_walker_fill_vme_batchbuffer(ctx,
                                             encode_state,
                                             width_in_mbs, height_in_mbs,
                                             kernel_shader,
                                             pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
                                             encoder_context);
    else
        gen7_vme_fill_vme_batchbuffer(ctx,
                                      encode_state,
                                      width_in_mbs, height_in_mbs,
                                      kernel_shader,
                                      pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
                                      encoder_context);

    intel_batchbuffer_start_atomic(batch, 0x1000);
    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
    BEGIN_BATCH(batch, 2);
    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8));
    OUT_RELOC(batch,
              vme_context->vme_batchbuffer.bo,
              I915_GEM_DOMAIN_COMMAND, 0, 
              0);
    ADVANCE_BATCH(batch);

    intel_batchbuffer_end_atomic(batch);
}

/*
 * Prepare everything needed to run VME for one H.264 picture: record the
 * stream level, refresh the MB/MV cost tables, set up surfaces, interface
 * descriptors, constants and VME state, then program the media pipeline.
 *
 * The picture is treated as intra when the first slice is an I or SI slice.
 * SI has to be included: gen7_vme_surface_setup asserts that a non-intra
 * picture is neither I nor SI, and gen7_vme_pipeline_programing already picks
 * the intra kernel for both types.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_prepare(VADriverContextP ctx, 
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    VAStatus vaStatus = VA_STATUS_SUCCESS;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    int is_intra = (pSliceParameter->slice_type == SLICE_TYPE_I ||
                    pSliceParameter->slice_type == SLICE_TYPE_SI);

    /* Track the level of the current sequence; storing it unconditionally has
     * the same effect as only storing it when it is unset or has changed. */
    vme_context->h264_level = pSequenceParameter->level_idc;

    intel_vme_update_mbmv_cost(ctx, encode_state, encoder_context);
    intel_h264_initialize_mbmv_cost(ctx, encode_state, encoder_context);

    /*Setup all the memory object*/
    gen7_vme_surface_setup(ctx, encode_state, is_intra, encoder_context);
    gen7_vme_interface_setup(ctx, encode_state, encoder_context);
    gen7_vme_constant_setup(ctx, encode_state, encoder_context);
    gen7_vme_avc_state_setup(ctx, encode_state, is_intra, encoder_context);

    /*Programing media pipeline*/
    gen7_vme_pipeline_programing(ctx, encode_state, encoder_context);

    return vaStatus;
}

/*
 * Submit the VME work by flushing the encoder's batchbuffer.
 * ctx and encode_state are not used. Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_run(VADriverContextP ctx, 
                             struct encode_state *encode_state,
                             struct intel_encoder_context *encoder_context)
{
    intel_batchbuffer_flush(encoder_context->base.batch);

    return VA_STATUS_SUCCESS;
}

/*
 * Final stage of the VME pipeline. Does nothing; all three parameters are
 * ignored. Always returns VA_STATUS_SUCCESS.
 */
static VAStatus gen7_vme_stop(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    (void)ctx;
    (void)encode_state;
    (void)encoder_context;

    return VA_STATUS_SUCCESS;
}

/*
 * Run the complete H.264 VME pass for one picture: init, prepare, run, stop,
 * in that order. `profile` is not used and the status values of the
 * individual stages are not inspected. Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen7_vme_pipeline(VADriverContextP ctx,
                  VAProfile profile,
                  struct encode_state *encode_state,
                  struct intel_encoder_context *encoder_context)
{
    /* Stage 1: reset per-frame buffers */
    gen7_vme_media_init(ctx, encoder_context);

    /* Stage 2: build state and program the pipeline */
    gen7_vme_prepare(ctx, encode_state, encoder_context);

    /* Stage 3: submit */
    gen7_vme_run(ctx, encode_state, encoder_context);

    /* Stage 4: tear-down hook */
    gen7_vme_stop(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}

/*
 * Allocate the MPEG-2 VME output buffer and bind it at slot `index`.
 *
 * The picture size in macroblocks is derived from the sequence's pixel
 * dimensions rounded up to a multiple of 16. The buffer holds one block per
 * macroblock, sized INTRA_VME_OUTPUT_IN_BYTES when is_intra is non-zero and
 * INTER_VME_OUTPUT_IN_BYTES otherwise.
 */
static void
gen7_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   int index,
                                   int is_intra,
                                   struct intel_encoder_context *encoder_context)

{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme = encoder_context->vme_context;
    VAEncSequenceParameterBufferMPEG2 *seq = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    int mbs_wide = ALIGN(seq->picture_width, 16) / 16;
    int mbs_high = ALIGN(seq->picture_height, 16) / 16;

    vme->vme_output.pitch = 16; /* in bytes, always 16 */
    vme->vme_output.num_blocks = mbs_wide * mbs_high;
    vme->vme_output.size_block = is_intra ? INTRA_VME_OUTPUT_IN_BYTES :
                                            INTER_VME_OUTPUT_IN_BYTES;

    vme->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
                                      "VME output buffer",
                                      vme->vme_output.num_blocks * vme->vme_output.size_block,
                                      0x1000);
    assert(vme->vme_output.bo);

    vme->vme_buffer_suface_setup(ctx,
                                 &vme->gpe_context,
                                 &vme->vme_output,
                                 BINDING_TABLE_OFFSET(index),
                                 SURFACE_STATE_OFFSET(index));
}

/*
 * Allocate the second-level VME batch buffer for MPEG-2 and bind it to a
 * surface slot.
 *
 * The buffer has one 32-byte block per macroblock of the picture
 * (dimensions rounded up to a multiple of 16 pixels) plus one extra block.
 *
 * index selects both the binding table entry and the surface state slot.
 */
static void
gen7_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                            struct encode_state *encode_state,
                                            int index,
                                            struct intel_encoder_context *encoder_context)

{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;

    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
    /*
     * In bytes.  This matches the 8 dwords that
     * gen7_vme_mpeg2_fill_vme_batchbuffer() emits per macroblock; the extra
     * block above leaves room for its two trailing dwords.
     */
    vme_context->vme_batchbuffer.size_block = 32;
    vme_context->vme_batchbuffer.pitch = 16;
    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
                                                   "VME batchbuffer",
                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
                                                   0x1000);
    /* Same allocation check as the VME output buffer setup. */
    assert(vme_context->vme_batchbuffer.bo);
    vme_context->vme_buffer_suface_setup(ctx,
                                         &vme_context->gpe_context,
                                         &vme_context->vme_batchbuffer,
                                         BINDING_TABLE_OFFSET(index),
                                         SURFACE_STATE_OFFSET(index));
}

/*
 * Set up every surface the MPEG-2 VME kernel accesses.
 *
 * Surface slots used:
 *   0 - current picture (source surface state)
 *   4 - current picture (media source surface state)
 *   1 - reference 0, only when !is_intra and the surface has a buffer object
 *   2 - reference 1, only when !is_intra and the surface has a buffer object
 *   3 - VME output buffer
 *   5 - VME second-level batch buffer
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen7_vme_mpeg2_surface_setup(VADriverContextP ctx, 
                             struct encode_state *encode_state,
                             int is_intra,
                             struct intel_encoder_context *encoder_context)
{
    struct object_surface *obj_surface;

    /*Setup surfaces state*/
    /* current picture for encoding */
    obj_surface = encode_state->input_yuv_object;
    gen7_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
    gen7_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);

    if (!is_intra) {
        /*
         * reference 0: guard against a missing reference object the same
         * way reference 1 is guarded, instead of dereferencing NULL.
         */
        obj_surface = encode_state->reference_objects[0];
        if (obj_surface && obj_surface->bo != NULL)
            gen7_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);

        /* reference 1 */
        obj_surface = encode_state->reference_objects[1];
        if (obj_surface && obj_surface->bo != NULL) 
            gen7_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
    }

    /* VME output */
    gen7_vme_mpeg2_output_buffer_setup(ctx, encode_state, 3, is_intra, encoder_context);
    gen7_vme_mpeg2_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);

    return VA_STATUS_SUCCESS;
}

/*
 * Fill the VME second-level batch buffer with one MEDIA_OBJECT command per
 * macroblock, walking every slice parameter element in submission order.
 *
 * Each command is 8 dwords: a 6-dword header carrying `kernel`, followed by
 * 2 dwords of inline data (macroblock position and per-macroblock flags).
 * After the last command a zero dword and MI_BATCH_BUFFER_END are appended.
 *
 * mb_width is the picture width in macroblocks; mb_height is not used here.
 * transform_8x8_mode_flag is OR-ed into the second inline dword.
 */
static void
gen7_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx,
                                    struct encode_state *encode_state,
                                    int mb_width, int mb_height,
                                    int kernel,
                                    int transform_8x8_mode_flag,
                                    struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    unsigned int *cmd;
    int buf_idx, elem_idx, mb_idx;

    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
    cmd = vme_context->vme_batchbuffer.bo->virtual;

    for (buf_idx = 0; buf_idx < encode_state->num_slice_params_ext; buf_idx++) {
        VAEncSliceParameterBufferMPEG2 *slice_param =
            (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[buf_idx]->buffer;

        for (elem_idx = 0;
             elem_idx < encode_state->slice_params_ext[buf_idx]->num_elements;
             elem_idx++, slice_param++) {
            int first_mb = slice_param->macroblock_address;
            int mb_total = slice_param->num_macroblocks;

            for (mb_idx = 0; mb_idx < mb_total; mb_idx++) {
                int mb_addr = first_mb + mb_idx;
                int mb_x = mb_addr % mb_width;
                int mb_y = mb_addr / mb_width;
                unsigned int neighbours = 0;

                /* Not in the first column. */
                if (mb_x != 0)
                    neighbours |= INTRA_PRED_AVAIL_FLAG_AE;

                /* Not in the first row. */
                if (mb_y != 0) {
                    neighbours |= INTRA_PRED_AVAIL_FLAG_B;

                    if (mb_x != 0)
                        neighbours |= INTRA_PRED_AVAIL_FLAG_D;

                    /* Not in the last column. */
                    if (mb_x != (mb_width - 1))
                        neighbours |= INTRA_PRED_AVAIL_FLAG_C;
                }

                /* MEDIA_OBJECT header */
                cmd[0] = (CMD_MEDIA_OBJECT | (8 - 2));
                cmd[1] = kernel;
                cmd[2] = 0;
                cmd[3] = 0;
                cmd[4] = 0;
                cmd[5] = 0;

                /* inline data */
                cmd[6] = (mb_width << 16 | mb_y << 8 | mb_x);
                cmd[7] = ((1 << 16) | transform_8x8_mode_flag | (neighbours << 8));

                cmd += 8;
            }
        }
    }

    /* Terminate the second-level batch. */
    cmd[0] = 0;
    cmd[1] = MI_BATCH_BUFFER_END;

    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
}

/*
 * Build the MPEG-2 VME second-level batch buffer and emit the commands that
 * launch it from the encoder's main batch.
 *
 * The walker-based filler is used only when every slice parameter element
 * starts at the first macroblock of a row (macroblock_address is a multiple
 * of the picture width in macroblocks); otherwise the plain per-macroblock
 * filler is used.  Both are invoked with MPEG2_VME_INTER_SHADER; is_intra
 * is not used here.
 */
static void
gen7_vme_mpeg2_pipeline_programing(VADriverContextP ctx, 
                                   struct encode_state *encode_state,
                                   int is_intra,
                                   struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;

    bool allow_hwscore = true;
    int s;

    /* Stop scanning as soon as one misaligned slice has been found. */
    for (s = 0; allow_hwscore && s < encode_state->num_slice_params_ext; s++) {
        int j;
        VAEncSliceParameterBufferMPEG2 *slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[s]->buffer;

        for (j = 0; j < encode_state->slice_params_ext[s]->num_elements; j++) {
            /*
             * Inspect element j of this buffer.  Checking only the first
             * element would miss misaligned slices later in the same buffer.
             */
            if (slice_param[j].macroblock_address % width_in_mbs) {
                allow_hwscore = false;
                break;
            }
        }
    }

    if (allow_hwscore) 
        gen7_vme_mpeg2_walker_fill_vme_batchbuffer(ctx,
                                                   encode_state,
                                                   width_in_mbs, height_in_mbs,
                                                   MPEG2_VME_INTER_SHADER,
                                                   encoder_context);
    else
        gen7_vme_mpeg2_fill_vme_batchbuffer(ctx, 
                                            encode_state,
                                            width_in_mbs, height_in_mbs,
                                            MPEG2_VME_INTER_SHADER,
                                            0,
                                            encoder_context);

    /* Emit pipeline state, then chain to the second-level batch buffer. */
    intel_batchbuffer_start_atomic(batch, 0x1000);
    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
    BEGIN_BATCH(batch, 2);
    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8));
    OUT_RELOC(batch,
              vme_context->vme_batchbuffer.bo,
              I915_GEM_DOMAIN_COMMAND, 0, 
              0);
    ADVANCE_BATCH(batch);

    intel_batchbuffer_end_atomic(batch);
}

/*
 * Prepare one MPEG-2 VME pass: record the stream level, set up state,
 * surfaces, interface descriptors and constants, then program the media
 * pipeline.  All setup helpers are called with is_intra == 0.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen7_vme_mpeg2_prepare(VADriverContextP ctx, 
                       struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferMPEG2 *seq_param =
        (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    const int level =
        seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK;

    /* Remember the level when it is not yet recorded or has changed. */
    if (vme_context->mpeg2_level == 0 || vme_context->mpeg2_level != level)
        vme_context->mpeg2_level = level;

    /* Set up all the memory objects */
    intel_vme_mpeg2_state_setup(ctx, encode_state, encoder_context);
    gen7_vme_mpeg2_surface_setup(ctx, encode_state, 0, encoder_context);
    gen7_vme_interface_setup(ctx, encode_state, encoder_context);
    gen7_vme_constant_setup(ctx, encode_state, encoder_context);
    gen7_vme_mpeg2_state_setup(ctx, encode_state, 0, encoder_context);

    /* Program the media pipeline */
    gen7_vme_mpeg2_pipeline_programing(ctx, encode_state, 0, encoder_context);

    return VA_STATUS_SUCCESS;
}

/*
 * MPEG-2 entry point for the VME stage.
 *
 * When the first slice parameter is not marked intra, the full VME pass
 * runs (init, prepare, run, stop).  For an intra slice VME is not
 * executed; the function only makes sure a VME output buffer exists,
 * allocating an intra-sized one if none is present.
 *
 * profile is not used.  Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen7_vme_mpeg2_pipeline(VADriverContextP ctx,
                        VAProfile profile,
                        struct encode_state *encode_state,
                        struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSliceParameterBufferMPEG2 *first_slice =
        (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;
    VAEncSequenceParameterBufferMPEG2 *seq_param =
        (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;

    if (!first_slice->is_intra_slice) {
        gen7_vme_media_init(ctx, encoder_context);
        gen7_vme_mpeg2_prepare(ctx, encode_state, encoder_context);
        gen7_vme_run(ctx, encode_state, encoder_context);
        gen7_vme_stop(ctx, encode_state, encoder_context);

        return VA_STATUS_SUCCESS;
    }

    /* No need to execute VME for an intra slice; just ensure the output buffer exists. */
    if (vme_context->vme_output.bo == NULL) {
        int mb_cols = ALIGN(seq_param->picture_width, 16) / 16;
        int mb_rows = ALIGN(seq_param->picture_height, 16) / 16;

        vme_context->vme_output.num_blocks = mb_cols * mb_rows;
        vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES;
        vme_context->vme_output.bo =
            dri_bo_alloc(i965->intel.bufmgr,
                         "MPEG2 VME output buffer",
                         vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
                         0x1000);
    }

    return VA_STATUS_SUCCESS;
}

/*
 * Destroy a VME context: tear down the GPE context, drop the references
 * held on the context's buffer objects, free its heap allocations and
 * finally free the context itself.
 *
 * context must point to a struct gen6_vme_context.
 */
static void
gen7_vme_context_destroy(void *context)
{
    struct gen6_vme_context *vme = context;

    i965_gpe_context_destroy(&vme->gpe_context);

    /* Buffer objects: output, state, second-level batch and QP cost tables. */
    dri_bo_unreference(vme->vme_output.bo);
    dri_bo_unreference(vme->vme_state.bo);
    dri_bo_unreference(vme->vme_batchbuffer.bo);
    dri_bo_unreference(vme->i_qp_cost_table);
    dri_bo_unreference(vme->p_qp_cost_table);
    dri_bo_unreference(vme->b_qp_cost_table);

    vme->vme_output.bo = NULL;
    vme->vme_state.bo = NULL;
    vme->vme_batchbuffer.bo = NULL;
    vme->i_qp_cost_table = NULL;
    vme->p_qp_cost_table = NULL;
    vme->b_qp_cost_table = NULL;

    /* Plain heap allocations owned by the context. */
    free(vme->vme_state_message);
    vme->vme_state_message = NULL;

    free(vme->qp_per_mb);
    vme->qp_per_mb = NULL;

    free(vme);
}

/*
 * Create and initialise the Gen7 VME context for an encoder context.
 *
 * Allocates the context and its VME state message buffer, configures the
 * GPE context (surface state / binding table size, interface descriptors,
 * CURBE and VFE parameters), selects the kernel list and pipeline entry
 * point for the codec (H.264 or MPEG-2), loads the kernels and installs the
 * context, its destructor and the surface setup callbacks.
 *
 * Returns True on success.  Returns False, leaving encoder_context
 * untouched, when a memory allocation fails.
 */
Bool gen7_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct gen6_vme_context *vme_context = calloc(1, sizeof(*vme_context));
    struct i965_kernel *vme_kernel_list = NULL;

    assert(vme_context);
    if (!vme_context)
        return False;

    /*
     * Allocate the state message up front so that an allocation failure can
     * be unwound before any GPU resources have been created.
     */
    vme_context->vme_state_message = malloc(VME_MSG_LENGTH * sizeof(int));
    assert(vme_context->vme_state_message);
    if (!vme_context->vme_state_message) {
        free(vme_context);
        return False;
    }

    vme_context->gpe_context.surface_state_binding_table.length =
        (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;

    vme_context->gpe_context.idrt.max_entries = MAX_INTERFACE_DESC_GEN6;
    vme_context->gpe_context.idrt.entry_size = sizeof(struct gen6_interface_descriptor_data);
    vme_context->gpe_context.curbe.length = CURBE_TOTAL_DATA_LENGTH;

    vme_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
    vme_context->gpe_context.vfe_state.num_urb_entries = 16;
    vme_context->gpe_context.vfe_state.gpgpu_mode = 0;
    vme_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
    vme_context->gpe_context.vfe_state.curbe_allocation_size = CURBE_ALLOCATION_SIZE - 1;

    gen7_vme_scoreboard_init(ctx, vme_context);

    /* Pick the kernel set and pipeline entry point for the codec. */
    if (encoder_context->codec == CODEC_H264) {
        vme_kernel_list = gen7_vme_kernels;
        vme_context->video_coding_type = VIDEO_CODING_AVC;
        vme_context->vme_kernel_sum = AVC_VME_KERNEL_SUM; 
        encoder_context->vme_pipeline = gen7_vme_pipeline; 
    } else if (encoder_context->codec == CODEC_MPEG2) {
        vme_kernel_list = gen7_vme_mpeg2_kernels;
        vme_context->video_coding_type = VIDEO_CODING_MPEG2;
        vme_context->vme_kernel_sum = MPEG2_VME_KERNEL_SUM;
        encoder_context->vme_pipeline = gen7_vme_mpeg2_pipeline;
    } else {
        /* Unsupported codec */
        assert(0);
    }

    i965_gpe_load_kernels(ctx,
                          &vme_context->gpe_context,
                          vme_kernel_list,
                          vme_context->vme_kernel_sum);

    vme_context->vme_surface2_setup = gen7_gpe_surface2_setup;
    vme_context->vme_media_rw_surface_setup = gen7_gpe_media_rw_surface_setup;
    vme_context->vme_buffer_suface_setup = gen7_gpe_buffer_suface_setup;

    encoder_context->vme_context = vme_context;
    encoder_context->vme_context_destroy = gen7_vme_context_destroy;

    return True;
}
