/*
   (c) Copyright 2000-2002  convergence integrated media GmbH.
   (c) Copyright 2002-2004  convergence GmbH.

   All rights reserved.

   Written by Denis Oliver Kropp <dok@directfb.org>,
              Andreas Hundt <andi@fischlustig.de>,
              Sven Neumann <neo@directfb.org> and
              Ville Syrjälä <syrjala@sci.fi>.

   Scaling routines ported from gdk_pixbuf by Sven Neumann
   <sven@convergence.de>.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/

#include <config.h>

#include <stdlib.h>
#include <stdio.h>

#include <pthread.h>

#include <directfb.h>

#include <core/core.h>
#include <core/coredefs.h>
#include <core/coretypes.h>

#include <core/palette.h>
#include <core/surfaces.h>

#include <direct/memcpy.h>
#include <direct/mem.h>
#include <direct/messages.h>
#include <direct/util.h>

#include <misc/util.h>
#include <misc/gfx_util.h>

#include <gfx/convert.h>


/* Number of bits used to index the precomputed sub-pixel weight sets. */
#define SUBSAMPLE_BITS 4
/* Number of sub-pixel positions per axis for which weights are precomputed. */
#define SUBSAMPLE (1 << SUBSAMPLE_BITS)
/* Mask extracting a sub-pixel position index from a shifted coordinate. */
#define SUBSAMPLE_MASK ((1 << SUBSAMPLE_BITS)-1)
/* Number of fractional bits in the fixed-point source coordinates and steps. */
#define SCALE_SHIFT 16


/*
 * Precomputed filter used by the scaling routines in this file.
 */
typedef struct _PixopsFilter PixopsFilter;

struct _PixopsFilter {
     /* SUBSAMPLE * SUBSAMPLE sets of n_x * n_y integer weights each,
        allocated with D_MALLOC by bilinear_make_fast_weights() */
     int *weights;
     /* number of source pixels per row contributing to one output pixel */
     int n_x;
     /* number of source rows contributing to one output pixel */
     int n_y;
     /* offset of the first sample in source pixels; converted to fixed
        point by multiplying with (1 << SCALE_SHIFT) */
     float x_offset;
     float y_offset;
};


/*
 * Stores a single pixel, given as separate r/g/b/a components, at 'dst'
 * using the pixel format of 'dst_surface'.
 *
 * If the surface has DSCAPS_PREMULTIPLIED set, the color components are
 * multiplied by (a + 1) / 256 before being stored.
 *
 * 'dx' is only looked at for YUY2 and UYVY, where nothing is written for
 * odd values and a 32 bit value is written for even values.
 *
 * For the indexed formats nothing is written if the surface has no palette.
 * For the planar YUV formats only a luma byte is written.
 * Unknown formats produce a one-time message and leave 'dst' untouched.
 */
static void rgba_to_dst_format (__u8 *dst,
                                __u32 r, __u32 g, __u32 b, __u32 a,
                                CoreSurface *dst_surface, int dx)
{
     CorePalette *palette = dst_surface->palette;
     __u32        y, cb, cr;

     if (dst_surface->caps & DSCAPS_PREMULTIPLIED) {
          const __u32 factor = a + 1;

          r = (r * factor) >> 8;
          g = (g * factor) >> 8;
          b = (b * factor) >> 8;
     }

     switch (dst_surface->format) {
          /* one byte per pixel */
          case DSPF_RGB332:
               dst[0] = PIXEL_RGB332( r, g, b );
               break;

          case DSPF_A8:
               dst[0] = a;
               break;

          case DSPF_LUT8:
               if (palette)
                    dst[0] = dfb_palette_search( palette, r, g, b, a );
               break;

          case DSPF_ALUT44:
               /* upper nibble is alpha, the rest comes from the palette lookup */
               if (palette)
                    dst[0] = (a & 0xF0) + dfb_palette_search( palette, r, g, b, 0x80 );
               break;

          /* two bytes per pixel */
          case DSPF_ARGB1555:
               ((__u16*) dst)[0] = PIXEL_ARGB1555( a, r, g, b );
               break;

          case DSPF_ARGB2554:
               ((__u16*) dst)[0] = PIXEL_ARGB2554( a, r, g, b );
               break;

          case DSPF_ARGB4444:
               ((__u16*) dst)[0] = PIXEL_ARGB4444( a, r, g, b );
               break;

          case DSPF_RGB16:
               ((__u16*) dst)[0] = PIXEL_RGB16 (r, g, b);
               break;

          /* three bytes per pixel, byte order depends on endianness */
          case DSPF_RGB24:
#ifdef WORDS_BIGENDIAN
               dst[0] = r;
               dst[1] = g;
               dst[2] = b;
#else
               dst[0] = b;
               dst[1] = g;
               dst[2] = r;
#endif
               break;

          /* four bytes per pixel */
          case DSPF_ARGB:
               ((__u32*) dst)[0] = PIXEL_ARGB( a, r, g, b );
               break;

          case DSPF_AiRGB:
               ((__u32*) dst)[0] = PIXEL_AiRGB( a, r, g, b );
               break;

          case DSPF_RGB32:
               ((__u32*) dst)[0] = PIXEL_RGB32( r, g, b );
               break;

          /* packed YUV, written as a 32 bit value on even 'dx' only */
          case DSPF_YUY2:
               if (dx & 1)  /* HACK */
                    break;

               RGB_TO_YCBCR( r, g, b, y, cb, cr );

               ((__u32*) dst)[0] = PIXEL_YUY2( y, cb, cr );
               break;

          case DSPF_UYVY:
               if (dx & 1)  /* HACK */
                    break;

               RGB_TO_YCBCR( r, g, b, y, cb, cr );

               ((__u32*) dst)[0] = PIXEL_UYVY( y, cb, cr );
               break;

          /* planar YUV, luma only */
          case DSPF_YV12:
          case DSPF_I420:
          case DSPF_NV12:
          case DSPF_NV21:
          case DSPF_NV16:
               D_ONCE( "format not fully supported (only luma plane, yet)" );

               dst[0] = ((r * 16829 + g *  33039 + b *  6416 + 0x8000) >> 16) + 16;
               break;

          default:
               D_ONCE( "unimplemented destination format (0x%08x)", dst_surface->format );
               break;
     }
}

/*
 * Yields a __u8 pointer to the start of line 'y' in a buffer starting at
 * 'dst' with 'pitch' bytes per line.
 *
 * If 'caps' contains DSCAPS_SEPARATED, even lines are stored from the start
 * of the buffer and odd lines after the first h/2 lines.
 *
 * Every argument is parenthesized so that expressions such as 'base + 1'
 * can be passed safely. 'y' and 'pitch' are evaluated more than once, so
 * arguments must not have side effects.
 */
#define LINE_PTR(dst,caps,y,h,pitch) \
     (((caps) & DSCAPS_SEPARATED) \
          ? (((__u8*)(dst)) + (y)/2 * (pitch) + (((y)%2) ? (h)/2 * (pitch) : 0)) \
          : (((__u8*)(dst)) + (y) * (pitch)))

/*
 * Copies a packed buffer of 32 bit ARGB pixels into a rectangle of a
 * destination buffer, converting to the format of 'dst_surface'.
 *
 * src          source pixels, drect->w * drect->h values stored row after
 *              row without padding
 * dst          start of the destination buffer
 * dpitch       bytes per destination line
 * drect        destination rectangle (position and size in pixels)
 * dst_surface  provides destination format, caps, height and palette
 *
 * A8 and non-premultiplied ARGB destinations are handled directly, all
 * other cases go through rgba_to_dst_format() pixel by pixel.
 */
void dfb_copy_buffer_32( __u32 *src,
                         void  *dst, int dpitch, DFBRectangle *drect,
                         CoreSurface *dst_surface )
{
     int x, y;
     int bpp = DFB_BYTES_PER_PIXEL( dst_surface->format );

     switch (dst_surface->format) {
          case DSPF_A8:
               for (y = drect->y; y < drect->y + drect->h; y++) {
                    __u8 *d = LINE_PTR( dst, dst_surface->caps,
                                        y, dst_surface->height, dpitch );

                    /* The source is packed, so it is indexed relative to the
                       rectangle while the destination is offset by drect->x. */
                    d += drect->x;

                    for (x = 0; x < drect->w; x++)
                         d[x] = src[x] >> 24;

                    src += drect->w;
               }
               break;

          case DSPF_ARGB:
               if (! (dst_surface->caps & DSCAPS_PREMULTIPLIED)) {
                    for (y = drect->y; y < drect->y + drect->h; y++) {
                         __u8 *d = LINE_PTR( dst, dst_surface->caps,
                                             y, dst_surface->height, dpitch );

                         direct_memcpy (d + drect->x * 4, src, drect->w * 4);

                         src += drect->w;
                    }
                    break;
               }
               /* fall through: premultiplied ARGB needs per-pixel conversion */

          default:
               for (y = drect->y; y < drect->y + drect->h; y++) {
                    __u8 *d = LINE_PTR( dst, dst_surface->caps,
                                        y, dst_surface->height, dpitch );

                    /* Start at the left edge of the rectangle,
                       not at the beginning of the line. */
                    d += drect->x * bpp;

                    for (x = drect->x; x < drect->x + drect->w; x++) {
                         __u32 pixel = *src++;

                         rgba_to_dst_format (d,
                                             (pixel & 0x00FF0000) >> 16,
                                             (pixel & 0x0000FF00) >> 8,
                                             (pixel & 0x000000FF),
                                             pixel >> 24,
                                             dst_surface,
                                             x);

                         d += bpp;
                    }
               }
               break;
     }
}

/*
 * Computes the 'n' one-dimensional filter weights for a sample located at
 * the sub-pixel position 'pos' on an axis that is scaled by 'scale'.
 *
 * For scale > 1.0 the weights interpolate between the first entry and the
 * following ones, otherwise each weight is the overlap of the source pixel
 * [i, i + 1] with the interval [pos, pos + 1/scale].
 */
static void make_axis_weights( float *weights, int n, float pos, float scale )
{
     int i;

     if (scale > 1.0) {        /* Bilinear */
          for (i = 0; i < n; i++) {
               weights[i] = ((i == 0) ? (1 - pos) : pos) / scale;
          }
     }
     else {                    /* Tile */
          for (i = 0; i < n; i++) {
               if (i < pos) {
                    if (i + 1 > pos)
                         weights[i] = MIN( i+ 1, pos+ 1/scale ) -pos;
                    else
                         weights[i] = 0;
               }
               else {
                    if (pos + 1/scale > i)
                         weights[i] = MIN( i+ 1, pos+ 1/scale ) -i;
                    else
                         weights[i] = 0;
               }
          }
     }
}

/*
 * Fills 'filter' for scaling by 'x_scale' horizontally and 'y_scale'
 * vertically.
 *
 * Sets n_x, n_y (each limited to 64), x_offset and y_offset and allocates
 * filter->weights with D_MALLOC, holding one set of n_x * n_y weights for
 * each of the SUBSAMPLE * SUBSAMPLE sub-pixel positions. The set for the
 * vertical position i and the horizontal position j starts at index
 * (i * SUBSAMPLE + j) * n_x * n_y.
 *
 * Returns 1 on success, in which case the caller has to release
 * filter->weights with D_FREE. Returns 0 on failure.
 */
static int bilinear_make_fast_weights( PixopsFilter *filter, float x_scale, float y_scale )
{
     int i_offset, j_offset;
     float *x_weights, *y_weights;
     int n_x, n_y;

     if (x_scale > 1.0) {      /* Bilinear */
          n_x = 2;
          filter->x_offset = 0.5 * (1/x_scale - 1);
     }
     else {                    /* Tile */
          n_x = D_ICEIL (1.0 + 1.0 / x_scale);
          filter->x_offset = 0.0;
     }

     if (y_scale > 1.0) {      /* Bilinear */
          n_y = 2;
          filter->y_offset = 0.5 * (1/y_scale - 1);
     }
     else {                    /* Tile */
          n_y = D_ICEIL (1.0 + 1.0/y_scale);
          filter->y_offset = 0.0;
     }

     /* limit the filter size */
     if (n_x > 64)
          n_x = 64;

     if (n_y > 64)
          n_y = 64;

     filter->n_y = n_y;
     filter->n_x = n_x;
     filter->weights = (int *) D_MALLOC( SUBSAMPLE * SUBSAMPLE * n_x * n_y *
                                         sizeof (int) );
     if (!filter->weights) {
          D_WARN ("couldn't allocate memory for scaling");
          return 0;
     }

     x_weights = (float *) alloca (n_x * sizeof (float));
     y_weights = (float *) alloca (n_y * sizeof (float));

     /* NOTE(review): alloca() usually has no failure indication,
        so this check is probably never taken. */
     if (!x_weights || !y_weights) {
          D_FREE( filter->weights );
          filter->weights = NULL;   /* don't leave a dangling pointer behind */

          D_WARN ("couldn't allocate memory for scaling");
          return 0;
     }

     for (i_offset = 0; i_offset < SUBSAMPLE; i_offset++) {
          /* the vertical weights only depend on the vertical position */
          make_axis_weights( y_weights, n_y, (float)i_offset / SUBSAMPLE, y_scale );

          for (j_offset = 0; j_offset < SUBSAMPLE; j_offset++) {
               int *pixel_weights = filter->weights
                                    + ((i_offset * SUBSAMPLE) + j_offset)
                                    * n_x * n_y;
               int i,j;

               make_axis_weights( x_weights, n_x, (float)j_offset / SUBSAMPLE, x_scale );

               /* combine both axes into 16 bit fixed point weights */
               for (i = 0; i < n_y; i++) {
                    for (j = 0; j < n_x; j++) {
                         *(pixel_weights + n_x * i + j) =
                         65536 * x_weights[j] * x_scale
                         * y_weights[i] * y_scale;
                    }
               }
          }
     }

     return 1;
}

/*
 * Computes one destination pixel from an n_x by n_y neighbourhood of source
 * pixels and stores it via rgba_to_dst_format().
 *
 * weights      n_x * n_y weights, stored row after row
 * src          n_y pointers to the source rows
 * x            index of the first source column, may lie outside of the row
 * sw           number of pixels per source row
 * dx           passed through to rgba_to_dst_format()
 *
 * Source columns outside of [0, sw - 1] are clamped to the nearest valid
 * column. Each color component is weighted by the pixel's alpha value.
 */
static void scale_pixel( int *weights, int n_x, int n_y,
                         void *dst, __u32 **src,
                         int x, int sw, CoreSurface *dst_surface, int dx )
{
     __u32 sum_r = 0, sum_g = 0, sum_b = 0, sum_a = 0;
     int   row, col;

     for (row = 0; row < n_y; row++) {
          const int   *row_weights = weights + n_x * row;
          const __u32 *line        = src[row];

          for (col = 0; col < n_x; col++) {
               int   column = x + col;
               __u32 pixel;
               __u32 weighted_alpha;

               /* clamp to the valid range of the source row */
               if (column < 0)
                    column = 0;
               else if (column >= sw)
                    column = sw - 1;

               pixel = line[column];

               weighted_alpha = (pixel >> 24) * row_weights[col];

               sum_r += weighted_alpha * (((pixel >> 16) & 0xFF) + 1);
               sum_g += weighted_alpha * (((pixel >>  8) & 0xFF) + 1);
               sum_b += weighted_alpha * (( pixel        & 0xFF) + 1);
               sum_a += weighted_alpha;
          }
     }

     /* round to 8 bit, values that are already at the maximum
        are not rounded to avoid an overflow */
     if ((sum_r >> 24) == 0xFF)
          sum_r = 0xFF;
     else
          sum_r = (sum_r + 0x800000) >> 24;

     if ((sum_g >> 24) == 0xFF)
          sum_g = 0xFF;
     else
          sum_g = (sum_g + 0x800000) >> 24;

     if ((sum_b >> 24) == 0xFF)
          sum_b = 0xFF;
     else
          sum_b = (sum_b + 0x800000) >> 24;

     if ((sum_a >> 16) == 0xFF)
          sum_a = 0xFF;
     else
          sum_a = (sum_a + 0x8000) >> 16;

     rgba_to_dst_format( dst, sum_r, sum_g, sum_b, sum_a, dst_surface, dx );
}

/*
 * Computes a run of destination pixels from 'dst' up to (excluding)
 * 'dst_end' and stores them via rgba_to_dst_format().
 *
 * weights      SUBSAMPLE sets of n_x * n_y weights, the set is selected by
 *              the upper SUBSAMPLE_BITS fractional bits of 'x'
 * src          n_y pointers to the source rows
 * x            source position of the first pixel as fixed point value
 *              with SCALE_SHIFT fractional bits
 * x_step       fixed point increment of 'x' per destination pixel
 * sw           not consulted by this function
 * dx           passed to rgba_to_dst_format(), incremented per pixel
 *
 * There is no clamping: for each pixel the columns (x >> SCALE_SHIFT) up to
 * (x >> SCALE_SHIFT) + n_x - 1 of every source row are read.
 *
 * Returns the position following the last pixel written.
 */
static void *scale_line( int *weights, int n_x, int n_y, void *dst,
                         void *dst_end, __u32 **src, int x, int x_step, int sw,
                         CoreSurface *dst_surface, int dx )
{
     __u8 *out = dst;
     __u8 *end = dst_end;

     while (out < end) {
          __u32 sum_r = 0, sum_g = 0, sum_b = 0, sum_a = 0;
          int   row, col;

          const int  first_column = x >> SCALE_SHIFT;
          const int  subsample    = (x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK;
          const int *set          = weights + subsample * n_x * n_y;

          for (row = 0; row < n_y; row++) {
               const int   *row_weights = set + n_x * row;
               const __u32 *pixels      = src[row] + first_column;

               for (col = 0; col < n_x; col++) {
                    const __u32 pixel          = pixels[col];
                    const __u32 weighted_alpha = (pixel >> 24) * row_weights[col];

                    sum_r += weighted_alpha * (((pixel >> 16) & 0xFF) + 1);
                    sum_g += weighted_alpha * (((pixel >>  8) & 0xFF) + 1);
                    sum_b += weighted_alpha * (( pixel        & 0xFF) + 1);
                    sum_a += weighted_alpha;
               }
          }

          /* round to 8 bit, values that are already at the maximum
             are not rounded to avoid an overflow */
          if ((sum_r >> 24) == 0xFF)
               sum_r = 0xFF;
          else
               sum_r = (sum_r + 0x800000) >> 24;

          if ((sum_g >> 24) == 0xFF)
               sum_g = 0xFF;
          else
               sum_g = (sum_g + 0x800000) >> 24;

          if ((sum_b >> 24) == 0xFF)
               sum_b = 0xFF;
          else
               sum_b = (sum_b + 0x800000) >> 24;

          if ((sum_a >> 16) == 0xFF)
               sum_a = 0xFF;
          else
               sum_a = (sum_a + 0x8000) >> 16;

          rgba_to_dst_format( out, sum_r, sum_g, sum_b, sum_a, dst_surface, dx );
          dx++;

          out += DFB_BYTES_PER_PIXEL (dst_surface->format);
          x   += x_step;
     }

     return out;
}

/*
 * Scales a packed buffer of 32 bit ARGB pixels into a rectangle of a
 * destination buffer, converting to the format of 'dst_surface'.
 *
 * src          source pixels, sw * sh values stored row after row
 * sw, sh       source size in pixels
 * dst          start of the destination buffer
 * dpitch       bytes per destination line
 * drect        destination rectangle (position and size in pixels)
 * dst_surface  provides destination format, caps, height and palette
 *
 * Nothing is done if the source or the destination is empty. If both have
 * the same size the pixels are copied with dfb_copy_buffer_32().
 *
 * Each destination line is produced in three parts: pixels whose source
 * neighbourhood starts left of the source row and pixels whose
 * neighbourhood would exceed the source row on the right are computed with
 * scale_pixel(), which clamps the source column. The pixels in between are
 * computed with scale_line(), which does not clamp.
 */
void dfb_scale_linear_32( __u32 *src, int sw, int sh,
                          void  *dst, int dpitch, DFBRectangle *drect,
                          CoreSurface *dst_surface )
{
     float scale_x, scale_y;
     int i, j;
     int sx, sy;
     int x_step, y_step;
     int scaled_x_offset;
     int bpp;
     __u32 **line_bufs;
     PixopsFilter filter;

     if (sw < 1 || sh < 1 || drect->w < 1 || drect->h < 1)
          return;

     if (drect->w == sw && drect->h == sh) {
          dfb_copy_buffer_32( src, dst, dpitch, drect, dst_surface );
          return;
     }

     /* With a pixel size of zero the output position would never advance. */
     bpp = DFB_BYTES_PER_PIXEL( dst_surface->format );
     if (bpp < 1)
          return;

     scale_x = (float)drect->w / sw;
     scale_y = (float)drect->h / sh;

     x_step = (1 << SCALE_SHIFT) / scale_x;
     y_step = (1 << SCALE_SHIFT) / scale_y;

     if (! bilinear_make_fast_weights( &filter, scale_x, scale_y ))
          return;

     scaled_x_offset = D_IFLOOR( filter.x_offset * (1 << SCALE_SHIFT) );
     sy = D_IFLOOR( filter.y_offset * (1 << SCALE_SHIFT) );

     /* Allocated once for all lines. Memory obtained from alloca() is not
        released before the function returns, so allocating per line would
        let the stack usage grow with the height of the rectangle. */
     line_bufs = (__u32 **) alloca( filter.n_y * sizeof (__u32 *) );

     for (i = drect->y; i < drect->y + drect->h; i++) {
          int y_start;
          int dest_x;
          int remaining;
          int run;
          long long span;
          int *run_weights;
          __u8 *outbuf;
          __u8 *outbuf_end;
          __u8 *run_end;
          __u8 *new_outbuf;

          y_start = sy >> SCALE_SHIFT;

          /* weight sets for the vertical sub-pixel position of this line */
          run_weights = filter.weights + ((sy >> (SCALE_SHIFT - SUBSAMPLE_BITS))
                                          & SUBSAMPLE_MASK) * filter.n_x * filter.n_y * SUBSAMPLE;

          /* source rows for this line, clamped to the first and last row */
          for (j = 0; j < filter.n_y; j++) {
               if (y_start <  0)
                    line_bufs[j] = src;
               else if (y_start < sh)
                    line_bufs[j] = src + sw * y_start;
               else
                    line_bufs[j] = src + sw * (sh - 1);

               y_start++;
          }

          outbuf = (LINE_PTR( dst, dst_surface->caps,
                              i, dst_surface->height, dpitch ) +
                    DFB_BYTES_PER_LINE( dst_surface->format, drect->x ));

          outbuf_end = outbuf + DFB_BYTES_PER_LINE( dst_surface->format, drect->w );
          sx = scaled_x_offset;
          dest_x = 0;

          /* left edge: the neighbourhood starts left of the source row */
          while ((sx >> SCALE_SHIFT) < 0 && outbuf < outbuf_end) {
               scale_pixel( run_weights + ((sx >> (SCALE_SHIFT - SUBSAMPLE_BITS))
                                           & SUBSAMPLE_MASK) * (filter.n_x * filter.n_y),
                            filter.n_x, filter.n_y, outbuf, line_bufs,
                            sx >> SCALE_SHIFT, sw, dst_surface, dest_x );

               sx += x_step;
               dest_x++;
               outbuf += bpp;
          }

          /* Number of pixels for which all n_x source columns are inside
             the source row, i.e. (position >> SCALE_SHIFT) + n_x <= sw. */
          remaining = (outbuf_end - outbuf) / bpp;
          span = (long long)(sw - filter.n_x + 1) * (1 << SCALE_SHIFT) - sx;

          if (span <= 0)
               run = 0;
          else if (x_step > 0) {
               long long count = (span + x_step - 1) / x_step;

               run = (count < remaining) ? (int) count : remaining;
          }
          else
               run = remaining;

          run_end = outbuf + run * bpp;

          /* scale_line() expects the fixed point position, not the column */
          new_outbuf = scale_line (run_weights, filter.n_x, filter.n_y, outbuf,
                                   run_end, line_bufs, sx,
                                   x_step, sw, dst_surface, dest_x);

          dest_x += (new_outbuf - outbuf) / bpp;
          sx = dest_x * x_step + scaled_x_offset;
          outbuf = new_outbuf;

          /* right edge: the neighbourhood exceeds the source row */
          while (outbuf < outbuf_end) {
               scale_pixel( run_weights + ((sx >> (SCALE_SHIFT - SUBSAMPLE_BITS))
                                           & SUBSAMPLE_MASK) * (filter.n_x * filter.n_y),
                            filter.n_x, filter.n_y, outbuf, line_bufs,
                            sx >> SCALE_SHIFT, sw, dst_surface, dest_x);

               sx += x_step;
               dest_x++;
               outbuf += bpp;
          }

          sy += y_step;
     }

     D_FREE(filter.weights);
}

