/* $Id: cpl_fit_body.h,v 1.2 2007/07/06 13:57:49 llundin Exp $
 *
 * This file is part of the ESO cpl package
 * Copyright (C) 2001-2007 European Southern Observatory
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#define CPL_TYPE_ADD(a) CONCAT2X(a, CPL_TYPE)
#define CPL_TYPE_ADD_CONST(a) CONCAT3X(a, CPL_TYPE, const)

/*----------------------------------------------------------------------------*/
/**
  @brief  Fit a polynomial to each pixel in a list of images
  @param  self     Preallocated imagelist to hold fitting result
  @param  ml       Lower triangular matrix L, L * L' = V' * V
  @param  mv       The transpose of the Vandermonde matrix, V'
  @param  x_pos    The vector of positions to fit (used for fiterror only)
  @param  values   The list of images with values to fit
  @param  xpow     The mindeg powers of x_pos (or NULL, when mindeg is zero)
  @param  xnmean   Minus the mean value of the x_pos elements (or zero)
  @param  fiterror When non-NULL, the error of the fit
  @return void
  @see cpl_fit_imagelist_polynomial

 */
/*----------------------------------------------------------------------------*/
static void
CPL_TYPE_ADD(cpl_fit_imagelist_polynomial)(cpl_imagelist * self,
                                           const cpl_matrix * ml,
                                           const cpl_matrix * mv,
                                           const cpl_vector * x_pos,
                                           const cpl_imagelist * values,
                                           const cpl_vector * xpow,
                                           double xnmean,
                                           cpl_image * fiterror)
{

    const cpl_type    pixeltype
        = cpl_image_get_type(cpl_imagelist_get(self, 0));

    const cpl_image * value = cpl_imagelist_get_const(values, 0);
    const int         np = cpl_imagelist_get_size(values);
    const int         nx = cpl_image_get_size_x(value);
    const int         ny = cpl_image_get_size_y(value);
    const int         nc = cpl_imagelist_get_size(self);
    const cpl_boolean do_err = fiterror != NULL ? CPL_TRUE : CPL_FALSE;

    const cpl_type    errtype
        = do_err ? cpl_image_get_type(fiterror) : CPL_TYPE_INVALID;


    /* Total number of fits to do */
    const int         imsize = nx * ny;
    /* Process this many fits in one go to improve cache usage
       - The code will be correct for any natural-number-value of ijblock,
       but a too small and a too large value will slow down the code. */
    const int         ijblock
        = cpl_fit_imagelist_polynomial_find_block_size(np, nc, do_err,
                                                          cpl_image_get_type(value),
                                                          pixeltype,
                                                          errtype);

    /* - except in that the last block may have fewer points */
    int               ijdo    = ijblock;

    const CPL_TYPE  * pi;
    double          * px  = cpl_malloc(ijblock * nc * sizeof(double));
    double          * pbw = cpl_malloc(ijblock * np * sizeof(double));
    /* The polynomial coefficients to be solved for
       - transposed, so the ijblock polynomials are stored consequtively */
    cpl_matrix      * mx  = cpl_matrix_wrap(ijblock, nc, px);
    /* The transpose of one row of values to be fitted */
    cpl_matrix      * mb  = cpl_matrix_wrap(ijblock, np, pbw); 
    int               i, j;
    int ij, ijstop;


    /* Process ijdo pixels at a time - Improves cache usage  */
    for (ij = 0; ij < imsize; ij += ijdo) {

        if (ij + ijdo > imsize) ijdo = imsize - ij;

        /* Upper limit for pixel index in this iteration */
        ijstop = ij + ijdo;
 
        /* Fill in matrices */
        for (j=0; j < np; j++) {
            pi = CPL_TYPE_ADD_CONST(cpl_image_get_data)
                (cpl_imagelist_get_const(values, j));
            /* The fact the mb is transposed makes this operation more
               expensive - which is OK, since it has a lower complexity
               than the subsequent use of mb */
            for (i=0; i < ijdo; i++) {
                pbw[np * i + j] = (double)pi[ij + i];
            }
        }

        /* Form the right hand side of the normal equations, X = V' * B */
        cpl_matrix_product_transpose(mx, mb, mv);

        /* In-place solution of the normal equations, L * L' * X = V' * B */
        cpl_matrix_solve_chol_transpose(ml, mx);

        if (xnmean != 0.0) {
            /* Shift polynomials back */
            for (i=0; i < ijdo; i++) {
                cpl_polynomial_shift_double(px + i * nc, nc, xnmean);
            }
        }

        /* Copy results to output image list */

        switch (pixeltype) {
            case CPL_TYPE_DOUBLE:
                cpl_fit_imagelist_fill_double(self, ij, ijstop, mx);
                break;
            case CPL_TYPE_FLOAT:
                cpl_fit_imagelist_fill_float(self, ij, ijstop, mx);
                break;
            case CPL_TYPE_INT:
                cpl_fit_imagelist_fill_int(self, ij, ijstop, mx);
                break;
            default:
                /* It is an error in CPL to reach this point */
                assert( 0 );
        }

        if (fiterror != NULL) {
            switch (cpl_image_get_type(fiterror)) {
            case CPL_TYPE_DOUBLE:
                cpl_fit_imagelist_residual_double(fiterror, ij, ijstop,
                                                     x_pos, xpow, mx, mb);
                break;
            case CPL_TYPE_FLOAT:
                cpl_fit_imagelist_residual_float(fiterror, ij, ijstop,
                                                    x_pos, xpow, mx, mb);
                break;
            case CPL_TYPE_INT:
                cpl_fit_imagelist_residual_int(fiterror, ij, ijstop,
                                                  x_pos, xpow, mx, mb);
                break;
            default:
                /* It is an error in CPL to reach this point */
                assert( 0 );
            }
        }
    }
    cpl_matrix_delete(mx);
    cpl_matrix_delete(mb);

}


/*----------------------------------------------------------------------------*/
/**
  @brief  Fill a row in the imagelist with the contents of the matrix
  @param  self     Preallocated imagelist
  @param  jj       The index of the image row to fill
  @param  mx       The transpose of the computed fitting coefficients
  @return void

  The fact the mx is transposed makes this operation more expensive
  - which is OK, since it has a lower complexity than the other use of mx

 */
/*----------------------------------------------------------------------------*/
static void CPL_TYPE_ADD(cpl_fit_imagelist_fill)(cpl_imagelist * self,
                                                 int ij, int ijstop,
                                                 const cpl_matrix * mx)
{

    const int      ijdo = ijstop - ij;
    const int      nc = cpl_matrix_get_ncol(mx);
    const double * px = cpl_matrix_get_data_const(mx);
    int            i, k;


    assert(cpl_imagelist_get_size(self) == nc);

    for (k=0; k < nc; k++) {
        CPL_TYPE * pdest = ij
            + CPL_TYPE_ADD(cpl_image_get_data)(cpl_imagelist_get(self, k));
        for (i=0; i < ijdo; i++) {
#ifdef CPL_TYPE_INT_ROUND
            /* Round off to nearest integer */
            pdest[i] = CPL_TYPE_INT_ROUND(px[nc * i + k]);
#else
            pdest[i] = (CPL_TYPE)px[nc * i + k];
#endif
        }
    }
}

/*----------------------------------------------------------------------------*/
/**
  @brief  Compute the residual of a polynomial fit to an imagelist
  @param  self     Preallocated image to hold residual
  @param  jj       The index of the image row to compute
  @param  x_pos    The vector of positions to fit (used for fiterror only)
  @param  xpow     The mindeg powers of x_pos (or NULL, when mindeg is zero)
  @param  mx       The transpose of the computed fitting coefficients
  @param  mb       The transpose of the values to be fitted
  @return void
  @see cpl_fit_imagelist_polynomial()

  The call requires (ijstop - ij) * (np * (2 * nc + 1) + 1) FLOPs.

 */
/*----------------------------------------------------------------------------*/
static void CPL_TYPE_ADD(cpl_fit_imagelist_residual)(cpl_image * self,
                                                     int ij, int ijstop,
                                                     const cpl_vector * x_pos,
                                                     const cpl_vector * xpow,
                                                     const cpl_matrix * mx,
                                                     const cpl_matrix * mb)
{
    double         err, sq_err;

    const int      nc = cpl_matrix_get_ncol(mx);
    const int      np = cpl_vector_get_size(x_pos);

    /* pself points to 1st element in jj'th row */
    CPL_TYPE     * pself = CPL_TYPE_ADD(cpl_image_get_data)(self);

    const double * pp = cpl_vector_get_data_const(x_pos);

    /* If mindeg == 0 xpow is NULL (since no multiplication is needed) */
    const double * pm = xpow != NULL ? cpl_vector_get_data_const(xpow) : NULL;
    const double * px = cpl_matrix_get_data_const(mx);
    const double * pb = cpl_matrix_get_data_const(mb);

    int i, j, k;


    for (i=ij; i < ijstop; i++, pb += np, px += nc) {
        sq_err = 0.0;
        for (j=0; j < np; j++) {

            /* Evaluate the polynomial using Horners scheme, see
               cpl_polynomial_eval_1d() */
            k = nc;
            err = px[--k];

            while (k) err = pp[j] * err + px[--k];

            /* Multiply by x^mindeg - wheen needed */
            /* Substract expected value to compute the residual */
            if (pm != NULL) {
                err = err * pm[j] - pb[j];
            } else {
                err -= pb[j];
            }

            sq_err += err * err;
        }
#ifdef CPL_TYPE_INT_ROUND
        /* Round off to nearest integer */
        pself[i] = CPL_TYPE_INT_ROUND(sq_err/(double)np);
#else
        pself[i] = (CPL_TYPE)(sq_err/(double)np);
#endif
    }

    cpl_tools_add_flops((ijstop - ij) * (np * (2 * nc + 1) + 1) );

}

#undef CPL_TYPE_ADD