/* $Id: cpl_fit.c,v 1.7 2007/08/07 08:05:33 llundin Exp $
 *
 * This file is part of the ESO Common Pipeline Library
 * Copyright (C) 2001-2004 European Southern Observatory
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * $Author: llundin $
 * $Date: 2007/08/07 08:05:33 $
 * $Revision: 1.7 $
 * $Name:  $
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/*-----------------------------------------------------------------------------
                                   Includes
 -----------------------------------------------------------------------------*/

#include "cpl_tools.h"
#include "cpl_vector.h"
#include "cpl_matrix.h"
#include "cpl_memory.h"
#include "cpl_error.h"
#include "cpl_vector_fit_impl.h"
#include "cpl_matrix_impl.h"

#include <math.h>
#include <assert.h>

/*----------------------------------------------------------------------------*/
/**
 * @defgroup cpl_fit High-level functions for non-linear fitting
 *
 * This module provides a routine for non-linear fitting.
 *
 * @par Synopsis:
 * @code
 *   #include "cpl_fit.h"
 * @endcode
 */
/*----------------------------------------------------------------------------*/
/**@{*/

/*-----------------------------------------------------------------------------
                                   Defines
 -----------------------------------------------------------------------------*/

/* Used in cpl_fit_imagelist_polynomial_find_block_size()
   - it only needs to be corrected if it about 10 times too small,
   or a few times too big */
#ifndef L2_CACHE_BYTES
#define L2_CACHE_BYTES 262144
#endif

/*-----------------------------------------------------------------------------
                        Private function prototypes
 -----------------------------------------------------------------------------*/

static void cpl_fit_imagelist_polynomial_double(cpl_imagelist *,
                                                const cpl_matrix *,
                                                const cpl_matrix *,
                                                const cpl_vector *,
                                                const cpl_imagelist *,
                                                const cpl_vector *,
                                                double, cpl_image *);

static void cpl_fit_imagelist_polynomial_float(cpl_imagelist *,
                                               const cpl_matrix *,
                                               const cpl_matrix *,
                                               const cpl_vector *,
                                               const cpl_imagelist *,
                                               const cpl_vector *,
                                               double, cpl_image *);

static void cpl_fit_imagelist_polynomial_int(cpl_imagelist *,
                                             const cpl_matrix *,
                                             const cpl_matrix *,
                                             const cpl_vector *,
                                             const cpl_imagelist *,
                                             const cpl_vector *,
                                             double, cpl_image *);

static void cpl_fit_imagelist_residual_double(cpl_image *, int, int,
                                              const cpl_vector *,
                                              const cpl_vector *,
                                              const cpl_matrix *,
                                              const cpl_matrix *);

static void cpl_fit_imagelist_residual_float(cpl_image *, int, int,
                                             const cpl_vector *,
                                             const cpl_vector *,
                                             const cpl_matrix *,
                                             const cpl_matrix *);

static void cpl_fit_imagelist_residual_int(cpl_image *, int, int,
                                           const cpl_vector *,
                                           const cpl_vector *,
                                           const cpl_matrix *,
                                           const cpl_matrix *);

static void cpl_fit_imagelist_fill_double(cpl_imagelist *, int, int,
                                          const cpl_matrix *);
 
static void cpl_fit_imagelist_fill_float(cpl_imagelist *, int, int,
                                         const cpl_matrix *);
 
static void cpl_fit_imagelist_fill_int(cpl_imagelist *, int, int,
                                       const cpl_matrix *);

static int cpl_fit_imagelist_polynomial_find_block_size(int, int,
                                                        cpl_boolean,
                                                        cpl_type,
                                                        cpl_type,
                                                        cpl_type);

/*-----------------------------------------------------------------------------
                              Function code
 -----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/**
   @brief   Fit a function to a set of data
   @param   x        N x D matrix of the positions to fit.
                     Each matrix row is a D-dimensional position.
   @param   sigma_x  Uncertainty (one sigma, gaussian errors assumed)
                     assosiated with @em x. Taking into account the
             uncertainty of the independent variable is currently
             unsupported, and this parameter must therefore be set
             to NULL.
   @param   y        The N values to fit.
   @param   sigma_y  Vector of size N containing the uncertainties of
                     the y-values. If this parameter is NULL, constant
             uncertainties are assumed.
   @param   a        Vector containing M fit parameters. Must contain
                     a guess solution on input and contains the best
             fit parameters on output.
   @param   ia       Array of size M defining which fit parameters participate
                     in the fit (non-zero) and which fit parameters are held
             constant (zero). At least one element must be non-zero.
             Alternatively, pass NULL to fit all parameters.
   @param   f        Function that evaluates the fit function
                     at the position specified by the first argument (an array of
             size D) using the fit parameters specified by the second
             argument (an array of size M). The result must be output
             using the third parameter, and the function must return zero
             iff the evaluation succeded.
   @param   dfda     Function that evaluates the first order partial
                     derivatives of the fit function with respect to the fit
             parameters at the position specified by the first argument
             (an array of size D) using the parameters specified by the
             second argument (an array of size M). The result must
             be output using the third parameter (array of size M), and
             the function must return zero iff the evaluation succeded.
   @param relative_tolerance
                     The algorithm converges by definition if the relative
                     decrease in chi squared is less than @em tolerance
                     @em tolerance_count times in a row. Recommended default:
                     CPL_FIT_LVMQ_TOLERANCE
   @param tolerance_count
                     The algorithm converges by definition if the relative
                     decrease in chi squared is less than @em tolerance
                     @em tolerance_count times in a row. Recommended default:
                     CPL_FIT_LVMQ_COUNT
   @param max_iterations
                     If this number of iterations is reached without convergence,
                     the algorithm diverges, by definition. Recommended default:
                     CPL_FIT_LVMQ_MAXITER
   @param mse        If non-NULL, the mean squared error of the best fit is
                     computed.
   @param red_chisq  If non-NULL, the reduced chi square of the best fit is
                     computed. This requires @em sigma_y to be specified.
   @param covariance If non-NULL, the formal covariance matrix of the best
                     fit parameters is computed (or NULL on error). On success
             the diagonal terms of the covariance matrix are guaranteed
             to be positive. However, terms that involve a constant
             parameter (as defined by the input array @em ia) are
             always set to zero. Computation of the covariacne matrix
             requires @em sigma_y to be specified.


   @return  CPL_ERROR_NONE iff OK.

   This function makes a minimum chi squared fit of the specified function
   to the specified data set using a Levenberg-Marquardt algorithm.

   Possible #_cpl_error_code_ set in this function:
   - CPL_ERROR_NULL_INPUT if an input pointer other than @em sigma_x, @em
     sigma_y, @em mse, @em red_chisq or @em covariance is NULL.
   - CPL_ERROR_ILLEGAL_INPUT if an input matrix/vector is empty, if @em ia
     contains only zero values, if any of @em relative_tolerance,
     @em tolerance_count or max_iterations @em is non-positive, if N <= M
     and @em red_chisq is non-NULL, if any element of @em sigma_x or @em sigma_y
     is non-positive, or if evaluation of the fit function or its derivative
     failed.
   - CPL_ERROR_INCOMPATIBLE_INPUT if the dimensions of the input
     vectors/matrices do not match, or if chi square or covariance computation
     is requested and @em sigma_y is NULL.
   - CPL_ERROR_ILLEGAL_OUTPUT if memory allocation failed.
   - CPL_ERROR_CONTINUE if the Levenberg-Marquardt algorithm failed to converge.
   - CPL_ERROR_SINGULAR_MATRIX if the covariance matrix could not be computed.

*/
/*----------------------------------------------------------------------------*/

cpl_error_code
cpl_fit_lvmq(const cpl_matrix *x, const cpl_matrix *sigma_x,
             const cpl_vector *y, const cpl_vector *sigma_y,
             cpl_vector *a, const int ia[],
             int    (*f)(const double x[], const double a[], double *result),
             int (*dfda)(const double x[], const double a[], double result[]),
             double relative_tolerance,
             int tolerance_count,
             int max_iterations,
             double *mse,
             double *red_chisq,
             cpl_matrix **covariance)
{

    cpl_error_code status = CPL_ERROR_NONE;

    status = _cpl_fit_lvmq(x, sigma_x, y, sigma_y, a, ia, f, dfda,
                           relative_tolerance, tolerance_count,
                           max_iterations, mse, red_chisq, covariance);

    if (status != CPL_ERROR_NONE) {
        cpl_error_set_where("cpl_fit_lvmq");
    }

    return status;

}

/*----------------------------------------------------------------------------*/
/**
  @brief  Least-squares fit a polynomial to each pixel in a list of images
  @param  x_pos      The vector of positions to fit
  @param  values     The list of images with values to fit
  @param  mindeg     The smallest degree with a non-zero coefficient
  @param  maxdeg     The polynomial degree of the fit, at least mindeg
  @param  is_eqdist  True iff the x_pos values are known to be equidistant
  @param  pixeltype  The pixel-type of the created image list
  @param  fiterror   When non-NULL, the error of the fit
  @note   values and x_pos must have the same number of elements.
  @note   The created imagelist must be deallocated with cpl_imagelist_delete().
  @note   x_pos must have at least 1 + (maxdeg - mindeg) distinct values.
  @note   Any bad-pixel map in the input is ignored.
  @return The image list of the fitted polynomial coefficients or NULL on error.
  @see cpl_polynomial_fit_1d_create()

  For each pixel, a polynomial representing the relation value = P(x) is
  computed where:
  P(x) = x^{mindeg} * (a_0 + a_1 * x + ... + a_{nc-1} * x^{nc-1}),
  where mindeg >= 0 and maxdeg >= mindeg, and nc is the number of
  polynomial coefficients to determine, nc = 1 + (maxdeg - mindeg).

  The returned image list thus contains nc coefficient images,
  a_0, a_1, ..., a_{nc-1}.

  np is the number of sample points, i.e. the number of elements in x_pos
  and number of images in the image list.

  is_eqdist is ignored if mindeg is nonzero, otherwise
  is_eqdist may to be set to CPL_TRUE if and only if the values in x_pos are
  known a-priori to be equidistant when sorted, eg. (1,2,3,4) and (1,3,2,4),
  but not (1,2,4,6). Setting is_eqdist to CPL_TRUE while mindeg is zero
  eliminates certain round-off errors.

  Even though it is not an error, it is hardly useful to use an image of pixel
  type integer for the fitting error. An image of pixel type float should on
  the other hand be sufficient for most fitting errors.

  The call requires the following number of FLOPs, where
  nz is the number of pixels in any one image in the imagelist:

  2 * nz * nc * (nc + np) + np * nc^2 + nc^3/3 + O(nc * (nc + np)).

  If mindeg is zero an additional nz * nc^2 FLOPs are required.

  If fiterror is non-NULL an additional 2 * nz * nc * np FLOPs are required.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input const pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if mindeg is negative or maxdeg is less than mindeg.
  - CPL_ERROR_INCOMPATIBLE_INPUT if x_pos and values have different lengths,
      or if fiterror is non-NULL with a different size than that of values,
      or if the input images do not all have the same dimensions and pixel type.
  - CPL_ERROR_DATA_NOT_FOUND if x_pos contains less than nc values.
  - CPL_ERROR_SINGULAR_MATRIX if x_pos contains less than nc distinct values.
  - CPL_ERROR_UNSUPPORTED_MODE if the chosen pixel type is not one of
        CPL_TYPE_DOUBLE, CPL_TYPE_FLOAT, CPL_TYPE_INT.
 */
/*----------------------------------------------------------------------------*/
cpl_imagelist * cpl_fit_imagelist_polynomial(const cpl_vector    * x_pos,
                                             const cpl_imagelist * values,
                                             int                   mindeg,
                                             int                   maxdeg,
                                             cpl_boolean           is_eqdist,
                                             cpl_type              pixeltype,
                                             cpl_image           * fiterror)
{

    cpl_imagelist   * self = NULL; /* Avoid (false) uninit warning */
    cpl_matrix      * mv;   /* The transpose of the Vandermonde matrix, V' */
    cpl_matrix      * mh;   /* Upper triangular part of SPD Hankel matrix,
                               H = V' * V */
    cpl_vector      * xhat;

    const cpl_image * value = cpl_imagelist_get_const(values, 0);
    const double    * dx;
    double            xmean;

    cpl_error_code   error;

    /* Number of unknowns to determine */
    const int         nc = 1 + maxdeg - mindeg;
    const int         np = cpl_vector_get_size(x_pos);
    const int         nx = cpl_image_get_size_x(value);
    const int         ny = cpl_image_get_size_y(value);

    const cpl_boolean is_eqzero = is_eqdist && mindeg == 0;

    int               i, j, k;


    cpl_ensure(x_pos  != NULL, CPL_ERROR_NULL_INPUT, NULL);
    cpl_ensure(values != NULL, CPL_ERROR_NULL_INPUT, NULL);

    cpl_ensure(mindeg >= 0,      CPL_ERROR_ILLEGAL_INPUT, NULL);
    cpl_ensure(maxdeg >= mindeg, CPL_ERROR_ILLEGAL_INPUT, NULL);

    assert( nc > 0);

    cpl_ensure(np == cpl_imagelist_get_size(values),
               CPL_ERROR_INCOMPATIBLE_INPUT, NULL);

    cpl_ensure(cpl_imagelist_is_uniform(values)==0,
               CPL_ERROR_INCOMPATIBLE_INPUT, NULL);

    cpl_ensure(pixeltype == CPL_TYPE_DOUBLE || pixeltype == CPL_TYPE_FLOAT ||
               pixeltype == CPL_TYPE_INT, CPL_ERROR_UNSUPPORTED_MODE, NULL);

    if (fiterror != NULL) {
        cpl_ensure(cpl_image_get_size_x(fiterror) == nx &&
                   cpl_image_get_size_y(fiterror) == ny,
                   CPL_ERROR_INCOMPATIBLE_INPUT, NULL);
    }

    cpl_ensure(np >= nc,   CPL_ERROR_DATA_NOT_FOUND, NULL);

    /* The Hankel matrix may be singular in such a fashion, that the pivot
       points in its Cholesky decomposition are positive due to rounding errors.
       To ensure that such singular systems are robustly detected, the number of
       distinct sampling points is counted.
    */

    cpl_ensure(!cpl_vector_ensure_distinct(x_pos, nc),
               CPL_ERROR_SINGULAR_MATRIX, NULL);
      
    if (mindeg == 0) {
        /* Transform: xhat = x - mean(x) */
        xhat = cpl_vector_transform_mean(x_pos, &xmean);
        assert( xhat != NULL );

    } else {
        xhat = (cpl_vector*)x_pos; /* xhat is not modified */
        xmean = 0.0;
    }

    dx = cpl_vector_get_data(xhat);

    /* Create matrices */
    mv = cpl_matrix_wrap(nc, np,
                         cpl_malloc(nc * np * sizeof(double)));

    /* Fill Vandermonde matrix */
    for (j=0; j < np; j++) {
        double f_prod = cpl_tools_ipow(dx[j], mindeg);
        cpl_matrix_set(mv,  0, j, f_prod);
        for (k=1; k < nc; k++) {
            f_prod *= dx[j];
            cpl_matrix_set(mv, k, j, f_prod);
        }
    }

    cpl_tools_add_flops( np * ( nc - 1) );

    if (xhat != x_pos) cpl_vector_delete(xhat);

    /* Form upper triangular part of the matrix of the normal equations,
       H = V' * V.
       As in cpl_polynomial_fit_1d_create() this could be done in
       O(nc * np) flops, rather than 2 * nc^2 * np, but this is
       negligible for any practical image size and is not done since
       mv still has to be formed in order to block-optimize the formation
       of the right-hand-size */
    mh = cpl_matrix_product_normal_create(mv);

    if (is_eqzero) {

        /* Ensure that the Hankel matrix has zeros on all odd skew diagonals
           - above the (non-skew) main diagonal */

        double * dmh = cpl_matrix_get_data(mh);

        for (i = 0; i < nc; i++) {
            for (j = i + 1; j < nc; j += 2) {
                dmh[nc * i + j] = 0.0;
            }
        }
    }

    /* Do an in-place Cholesky-decomposition of H into L, such that L * L' = H.
       This is an O(nc^3) operation, while the subsequent, repeated solve using
       L is only an O(nc^2) operation.
       Further, while the Cholesky-decomposition may fail, the subsequent solve
       is robust. */
    error = cpl_matrix_decomp_chol(mh);

    if (!error) {

        cpl_vector * xpow = NULL;

        /* Should not be able to fail at this point */

        /* Allocate nc images to store the results */
        self = cpl_imagelist_new();
        for (i=0; i < nc; i++) {
            /* Could also wrap around a cpl_malloc() */
            cpl_image * image = cpl_image_new(nx, ny, pixeltype);
 
            error = cpl_imagelist_set(self, image, i);
 
            assert( !error);
        }

        if (mindeg == 1) {
            xpow = (cpl_vector*)x_pos; /* xpow is not modified */
        } if (mindeg > 1) {
            const double * d_pos = cpl_vector_get_data_const(x_pos);
            double       * ppow  = (double*)cpl_malloc(np * sizeof(double));

            xpow  = cpl_vector_wrap(np, ppow);

            for (i = 0; i < np; i++) {
                ppow[i] = cpl_tools_ipow(d_pos[i], mindeg);
            }

        }

        switch (cpl_image_get_type(value)) {
        case CPL_TYPE_DOUBLE:
            cpl_fit_imagelist_polynomial_double(self, mh, mv, x_pos, values,
                                                xpow, -xmean, fiterror);
            break;
        case CPL_TYPE_FLOAT:
            cpl_fit_imagelist_polynomial_float(self, mh, mv, x_pos, values,
                                               xpow, -xmean, fiterror);
            break;
        case CPL_TYPE_INT:
            cpl_fit_imagelist_polynomial_int(self, mh, mv, x_pos, values,
                                             xpow, -xmean, fiterror);
            break;
        default:
            /* It is an error in CPL to reach this point */
            assert( 0 );
        }

        if (xpow != x_pos) cpl_vector_delete(xpow);

    }

    cpl_matrix_delete(mh);
    cpl_matrix_delete(mv);

    /* Propagate the error, if any */
    cpl_ensure(!error, error, NULL);

    return self;

}

/**@}*/

/*----------------------------------------------------------------------------*/
/**
  @brief   Determine the number of pixels that can be processed within the L2
  @param   np      The number of samplint points
  @param   nc      The number of coefficients to determine
  @param   ip      The pixel-type of the input image list
  @param   op      The pixel-type of the output image list
  @param   ep      The pixel-type of the fitting error
  @param   do_err  CPL_TRUE iff fiterror is to be computed
  @return  The number of pixels, or 1 if the cache is too small

  The speed of cpl_fit_imagelist_polynomial() is only reduced significantly
  if the estimated size of the L2-cache is off by about an order of magnitude or
  more, especially if the actual cache size is much smaller than assumed here.

 */
/*----------------------------------------------------------------------------*/
static int cpl_fit_imagelist_polynomial_find_block_size(int np, int nc,
                                                        cpl_boolean do_err,
                                                        cpl_type ip,
                                                        cpl_type op,
                                                        cpl_type ep)
{

    int blocksize;

    /* The storage [bytes] needed for mv and mh */
    int c0 = (nc * nc + nc * np) * sizeof(double);
    /* Storage per pixel needed for mx, mb and the input and output images */
    int c1 =  np * cpl_type_get_sizeof(ip) + nc * cpl_type_get_sizeof(op)
        + (nc + np) * sizeof(double);

    if (do_err) {
        /* The storage [bytes] needed for xpos and xpow */
        c0 += (2 * np) * sizeof(double);

        /* Storage per pixel needed for fitting error */
        c1 += cpl_type_get_sizeof(ep);
    }

    /* In principle the number of pixels that can be processed within the
       L2 cache would be (L2_CACHE_BYTES - c0) / c1.
       Apparently, the effective size
       of the cache is observed to be about four times smaller */

    blocksize = ((L2_CACHE_BYTES)/4 - 10 * c0 - 1024) / c1;

    return blocksize > 0 ? blocksize : 1;

}


/* Define the C-type dependent functions */

/* These two macros are needed for support of the different pixel types */

#define CONCAT(a,b) a ## _ ## b
#define CONCAT2X(a,b) CONCAT(a,b)
#define CONCAT3X(a,b,c) CONCAT2X(CONCAT2X(a,b),c)

#define CPL_TYPE double
#include "cpl_fit_body.h"
#undef CPL_TYPE

#define CPL_TYPE float
#include "cpl_fit_body.h"
#undef CPL_TYPE

#define CPL_TYPE_INT_ROUND(A) ((int)floor(0.5 + (A)))
#define CPL_TYPE int
#include "cpl_fit_body.h"
#undef CPL_TYPE
#undef CPL_TYPE_INT_ROUND