/* $Id: cpl_polynomial.c,v 1.79 2007/07/30 08:05:26 llundin Exp $
 *
 * This file is part of the ESO Common Pipeline Library
 * Copyright (C) 2001-2004 European Southern Observatory
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * $Author: llundin $
 * $Date: 2007/07/30 08:05:26 $
 * $Revision: 1.79 $
 * $Name:  $
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/*-----------------------------------------------------------------------------
                                   Includes
 -----------------------------------------------------------------------------*/

#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <math.h>

#include <cxmessages.h>

#include "cpl_tools.h"
#include "cpl_matrix_impl.h"
#include "cpl_memory.h"
#include "cpl_polynomial.h"

/*----------------------------------------------------------------------------*/
/**
 * @defgroup cpl_polynomial Polynomials 
 *
 * This module provides functions to handle uni- and multivariate polynomials.
 *
 * Univariate polynomials use the Horner rule for evaluation, while 
 * multivariate polynomials are evaluated simply as the sum of each term.
 *
 * This means that of the two polynomials
 * @verbatim
 * P1(x) = p0 + p1.x + p4.x^2
 * @endverbatim
   and
 * @verbatim
 * P2(x,y) = p0 + p1.x + p2.y + p3.x.y + p4.x^2 + p5.y^2
 * @endverbatim
 * P1(x) may evaluate to more accurate results than P2(x,0),
 * especially around the roots.
 *
 * Note that a polynomial like P3(z) = p0 + p1.z + p2.z^2 + p3.z^3, z=x^4
 * is preferable to p4(x) = p0 + p1.x^4 + p2.x^8 + p3.x^12.
 *
 */
/*----------------------------------------------------------------------------*/
/**@{*/

/*-----------------------------------------------------------------------------
                                   Type definition
 -----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/*
                  Polynomial object.

  If you want to store the following polynomial:
  @verbatim
  p(x,y) = p0 + p1.x + p2.y + p3.x.y + p4.x^2 + p5.y^2
  @endverbatim
  You would internally have:
  @verbatim
  nc  = 6 (from 0 to 5 incl.)
  dim = 2 (x and y)
  c[]  pow[] contains (pow[] is stored row-wise):
  p0   0 0
  p1   1 0
  p2   0 1
  p3   1 1
  p4   2 0
  p5   0 2
  @endverbatim
  i.e. the i'th coefficient is multiplied by the variable in dimension j
  lifted to this power:
  pow[dim * i + j]

  1-dimensional polynomials are stored differently:
  pow[] is not used.  dim == 1.
  Instead all coefficients (including those with zero-value) are stored
  in order of increasing power - in c.
  Thus the constant coefficient is stored in c[0] while the leading coefficient
  is stored in c[nc-1].
  This storage scheme allows for the usage of the Horner rule.

 */
/*----------------------------------------------------------------------------*/

struct _cpl_polynomial_ {

    /* The dimension of the polynomial */
    int         dim;

    /* if dim == 1: The number of coefficients in c (degree + 1).
       if dim >  1: The number of (non-zero) coefficients in c. */
    int         nc;

    /* The coefficients. If dim == 1: c0, c_1, ..., c_{nc - 1} */
    double  *   c;

    /* If dim == 1: Not used
       If dim > 1: The nc * dim powers of the variables x1, x2, ..., xdim */
    int     *  pow;

    /* If dim == 1: Not used
       If dim > 1: Temporary storage to speed up evaluation
       max_degree[i] is the largest power used for dimension i.
       eval_pow[i] holds space for 1+max_degree[i] double's. */
    int * max_degree;
    double ** eval_pow;
};

/*-----------------------------------------------------------------------------
                        Private function prototypes
 -----------------------------------------------------------------------------*/

static cpl_error_code cpl_polynomial_delete_coeff(cpl_polynomial *,
                                                  const int *);
static void cpl_matrix_fill_normal_vandermonde(cpl_matrix *, cpl_matrix *,
                                               const cpl_vector *, cpl_boolean,
                                               const cpl_vector *);

/*-----------------------------------------------------------------------------
                              Function codes
 -----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/**
  @brief    Create a new cpl_polynomial
  @param    dim    The positive polynomial dimension (number of variables)
  @return   1 newly allocated cpl_polynomial, or NULL on error

  The returned object must be deallocated using cpl_polynomial_delete().

  A newly created polynomial has degree 0 and evaluates as 0.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_ILLEGAL_INPUT if dim is negative or zero
 */
/*----------------------------------------------------------------------------*/
cpl_polynomial * cpl_polynomial_new(int dim)
{
    cpl_polynomial * p;

    /* Test input */
    cpl_ensure(dim > 0, CPL_ERROR_ILLEGAL_INPUT, NULL);
    
    /* Allocate struct - and set nc == 0 */
    p = cpl_calloc(1, sizeof(cpl_polynomial));
    p->dim = dim;
   
    return p;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Delete a cpl_polynomial 
  @param    p    cpl_polynomial to delete
  @return   void

 */
/*----------------------------------------------------------------------------*/
void cpl_polynomial_delete(cpl_polynomial * p)
{

    if (p == NULL) return;

    if (p->nc > 0) {
        cpl_free(p->c);
        if (p->dim > 1) {
            while (p->dim--) {
                cpl_free(p->eval_pow[p->dim]);
            }
            cpl_free(p->pow);
            cpl_free(p->eval_pow);
            cpl_free(p->max_degree);
        }
    }

    cpl_free(p);

    return;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Dump a cpl_polynomial as ASCII to a stream
  @param    p       Input cpl_polynomial to dump
  @param    stream  Output stream, accepts @c stdout or @c stderr
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  Each coefficient is preceded by its integer power(s) and
  written on a single line.
  If the polynomial has non-zero coefficients, only those are printed,
  otherwise the (zero-valued) constant term is printed.

  Comment lines start with the hash character.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_FILE_IO if the write operation fails
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_dump(const cpl_polynomial * p,
                                   FILE                 * stream)
{
    const cpl_error_code err = CPL_ERROR_FILE_IO;
    int i, dim;


    cpl_ensure_code(stream != NULL, CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p      != NULL, CPL_ERROR_NULL_INPUT);

    cpl_ensure_code( fprintf(stream, 
        "#----- %d dimensional polynomial -----\n", p->dim) > 0, err);

    for (i=0 ; i < p->dim ; i++)
        cpl_ensure_code(fprintf(stream, "%d.dim.power  ", i+1) > 0, err);

    cpl_ensure_code(fprintf(stream, "coefficient\n") > 0,  err);

    if (p->nc == 0) {
        for (dim=0 ; dim < p->dim ; dim++)
            cpl_ensure_code(fprintf(stream, "  %5d      ", 0) > 0, err);

        cpl_ensure_code(fprintf(stream, "0\n") > 0,  err);
    }

    for (i=0 ; i < p->nc ; i++) {
        if (p->dim == 1) {
            if (p->c[i] == 0.0) continue; /* Only non-zero coefficients */
            cpl_ensure_code(fprintf(stream, "  %5d      ", i) > 0,  err);
        } else
            for (dim=0 ; dim < p->dim ; dim++)
                cpl_ensure_code(fprintf(stream, "  %5d      ",
                                        p->pow[p->dim * i + dim]) > 0, err);

        cpl_ensure_code(fprintf(stream, "%g\n", p->c[i]) > 0, err);

    }

    cpl_ensure_code(
        fprintf(stream, "#------------------------------------\n") > 0, err);

    return CPL_ERROR_NONE;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    This function duplicates an existing polynomial
  @param    p   The input cpl_polynomial
  @return   A newly allocated cpl_polynomial or NULL on error

  Notice that the returned object is a newly allocated cpl_polynomial that 
  must be deallocated by the caller using cpl_polynomial_delete().

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
*/
/*----------------------------------------------------------------------------*/
cpl_polynomial * cpl_polynomial_duplicate(const cpl_polynomial * p)
{
    cpl_polynomial      *   out;


    /* Test inputs */
    cpl_ensure(p != NULL,  CPL_ERROR_NULL_INPUT, NULL);

    if (p->nc == 0) return cpl_polynomial_new(p->dim);

    /* Create a new cpl_polynomial and copy the members */
    out = cpl_malloc(sizeof(cpl_polynomial));
    memcpy(out, p, sizeof(cpl_polynomial));

    /* Copy the coefficients */
    out->c = cpl_malloc(out->nc * sizeof(double));
    memcpy(out->c, p->c, out->nc * sizeof(double));

    if (out->dim > 1) {
        /* Copy the powers - and eval-workspace */
        int dim;

        out->pow = cpl_malloc(out->nc * out->dim * sizeof(int));
        memcpy(out->pow, p->pow, out->nc * out->dim * sizeof(int));

        out->max_degree = cpl_malloc(out->dim * sizeof(int));
        memcpy(out->max_degree, p->max_degree, out->dim * sizeof(int));

        out->eval_pow = cpl_malloc(out->dim * sizeof(double*));

        for (dim=0 ; dim < out->dim ; dim++) {

           out->eval_pow[dim] = cpl_malloc((1+out->max_degree[dim]) * sizeof(double));
           out->eval_pow[dim][0] = 1.0; /* Always 1 */
        }

    }

    return out;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    This function copies contents of a polynomial into another one 
  @param    out  Pre-allocated output cpl_polynomial
  @param    in   Input cpl_polynomial
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  in and out must point to different polynomials.

  If out already contains coefficients, they are overwritten.

  This is the only function that can modify the dimension of a polynomial.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INCOMPATIBLE_INPUT if in and out point to the same polynomial
*/
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_copy(cpl_polynomial * out,
                                   const cpl_polynomial * in)
{

    
    /* Check input */
    cpl_ensure_code(out != NULL, CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(in  != NULL, CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(in  != out,  CPL_ERROR_INCOMPATIBLE_INPUT);


    if (out->dim == in->dim && in->nc > 0 && out->nc > 0) {
        /* The polynomials have equal dimension
          - realloc() if needed and just copy all buffers */

        if (in->dim > 1) {
            /* Copy the powers */
            int dim;

            if (out->nc != in->nc) out->pow = cpl_realloc(out->pow, in->dim *
                                                          in->nc * sizeof(int));
            memcpy(out->pow, in->pow, in->dim * in->nc * sizeof(int));

            memcpy(out->max_degree, in->max_degree, in->dim * sizeof(int));

            for (dim=0 ; dim < in->dim ; dim++) {
                out->eval_pow[dim] =
                    cpl_realloc(out->eval_pow[dim],
                            (1+out->max_degree[dim]) * sizeof(double));
            }
        }

        /* Resize the coefficient buffer  */
        if (out->nc != in->nc) {
            out->nc = in->nc;
            out->c = cpl_realloc(out->c, in->nc * sizeof(double));
        }

        memcpy(out->c, in->c, in->nc * sizeof(double));

    } else {
        /* When the dimensions are different,
           or if one of the polynomials have no coefficients,
           avoiding the malloc()/free()'s is too complicated
           - instead just duplicate the source and swap it
           into place */
        cpl_polynomial * tmp  = cpl_polynomial_duplicate(in);
        void           * swap = cpl_malloc(sizeof(cpl_polynomial));


        memcpy(swap, out,  sizeof(cpl_polynomial));
        memcpy(out,  tmp,  sizeof(cpl_polynomial));
        memcpy(tmp,  swap, sizeof(cpl_polynomial));

        /* Delete the buffers originally used by out - and the swap space */
        cpl_polynomial_delete(tmp);
        cpl_free(swap);

    }

    return CPL_ERROR_NONE;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Compare the coefficients of two polynomials
  @param    p1   the 1st polynomial
  @param    p2   the 2nd polynomial
  @param    tol  The absolute (non-negative) tolerance
  @return   0 when equal, positive when different, negative on error.

  The two polynomials are considered equal iff they have identical
  dimensions and the absolute difference between their coefficients
  does not exceed the tolerance.

  This means that the following pair of polynomials per definition are
  considered different:
  P1(x1,x2) = 3*x1 different from P2(x1) = 3*x1.

  If all parameters are valid and p1 and p2 point to the same polynomial the
  functions returns 0.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if tol is negative 
 */
/*----------------------------------------------------------------------------*/
#define CPL_POLYNOMIAL_CMP                                              \
                /* Verify that it differs within tolerance */           \
                if (fabs(p1->c[i] - p2->c[j]) <= tol) {                 \
                /* Verify that the powers match */                      \
                    for (dim=0; dim < p1->dim; dim++)                   \
                        if (p1->pow[p1->dim * i + dim]                  \
                         != p2->pow[p1->dim * j + dim]) break;          \
                    if (dim == p1->dim) break;   /* - found it */       \
                }

int cpl_polynomial_compare(const cpl_polynomial * p1, const cpl_polynomial * p2,
                           double tol)
{

    int i;

    /* Test parameters */
    cpl_ensure(p1 != NULL, CPL_ERROR_NULL_INPUT, -1);
    cpl_ensure(p2 != NULL, CPL_ERROR_NULL_INPUT, -2);

    cpl_ensure(tol >= 0,   CPL_ERROR_ILLEGAL_INPUT, -5);


    if (p1 == p2) return 0;

    if (p1->dim != p2->dim) return 1;

    if (p1->dim == 1) {
        /* If the degrees differ, the leading coefficient(s) must be
           smaller than tol */
        for (i = p1->nc; i > p2->nc; i--) if (fabs(p1->c[i]) > tol) return 2;
        for (i = p2->nc; i > p1->nc; i--) if (fabs(p2->c[i]) > tol) return 3;

        /* At this point i is equal to min(p1->nc, p2->nc) */
        while (i--) if (fabs(p1->c[i]-p2->c[i]) > tol) return 4;

    } else {
        /* For each 'non-zero' coefficient in p1, the same one must be in p2
           and vice-versa */
        int * found2 = cpl_calloc(p2->nc, sizeof(int));
        int dim;
        int j;

        /* Verify that each coefficient in p1 is either below tol or
           present in p2. Remember which of the coefficients in p2 were
           found */
        for (i = 0; i < p1->nc; i++) {
            if (fabs(p1->c[i]) <= tol) continue;
            for (j = 0; j < p2->nc; j++) {
                if (found2[j]) continue; /* Use a coefficient just once */
                CPL_POLYNOMIAL_CMP;
            }
            /* Verify that one was found */
            if (j == p2->nc) break;
            found2[j] = TRUE;
        }
        if (i < p1->nc) {
            cpl_free(found2);
            return 5;
        }
        for (j = 0; j < p2->nc; j++) {
            if (found2[j] || fabs(p2->c[j]) <= tol) continue;
            for (i=0; i < p1->nc; i++) {
                CPL_POLYNOMIAL_CMP;
            }
            /* Verify that one was found */
            if (i == p1->nc) break;
        }
        cpl_free(found2);

        if (j < p2->nc) return 6;
    }

    return 0;

}

/*----------------------------------------------------------------------------*/
/**
  @brief    The degree of the polynomial
  @param    p   the polynomial
  @return   The degree or negative on error

  The degree is the highest sum of exponents (with a non-zero coefficient).

  If there are no non-zero coefficients the degree is zero.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
 */
/*----------------------------------------------------------------------------*/
int cpl_polynomial_get_degree(const cpl_polynomial * p)
{
    int degree;


    /* Test parameters */
    cpl_ensure(p != NULL, CPL_ERROR_NULL_INPUT, -1);

    if (p->nc == 0) return 0;

    if (p->dim == 1) {
        return p->nc-1;
    } else {
        int dim;
        int sum;
        int i;

        degree = -1;
        for (i = 0; i < p->nc; i++) {
            if (p->c[i] == 0) continue;
            sum = 0;
            for (dim = 0; dim < p->dim; dim++) sum += p->pow[p->dim * i + dim];
            if (sum > degree) degree = sum;
        }
    }

    /* User may have reset leading coefficient(s) to zero */
    if (degree < 0) return 0;

    return degree;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    The dimension of the polynomial
  @param    p   the polynomial
  @return   The dimension or negative on error

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
 */
/*----------------------------------------------------------------------------*/
int cpl_polynomial_get_dimension(const cpl_polynomial * p) 
{


    /* Test entries */
    cpl_ensure(p != NULL, CPL_ERROR_NULL_INPUT, -1);

    return p->dim;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Get a coefficient of the polynomial
  @param    in      the input polynomial
  @param    pows    the powers of the different variables
  @return   The coefficient or undefined on error

  The array pows must have the size of the polynomial dimension and have
  non-negative elements.

  It is allowed to specify a (set of) power(s) for which no coefficient
  has previously been set. In this case the function returns zero.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if pows contains negative values
 */
/*----------------------------------------------------------------------------*/
double cpl_polynomial_get_coeff(
        const cpl_polynomial    *   in, 
        const int               *   pows)
{
    double coef = 0.0; /* Default value */
    int dim;


    /* Test entries */
    cpl_ensure(in,  CPL_ERROR_NULL_INPUT, coef);
    cpl_ensure(pows, CPL_ERROR_NULL_INPUT, coef);

    for (dim = 0 ;  dim < in->dim ; dim++) 
        cpl_ensure(pows[dim] >= 0, CPL_ERROR_ILLEGAL_INPUT, coef);
    
    /* Find the coeff */
    if (in->dim == 1) {
        if (pows[0] < in->nc) coef = in->c[pows[0]];
    } else {
        int i;
        for (i=0 ; i < in->nc ; i++) {
            if (!memcmp(in->pow + in->dim * i, pows, in->dim * sizeof(int)))
                break; /* Found the right combination of powers */
        }
        if (i < in->nc) coef = in->c[i];
    }
    return coef;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Set a coefficient of the polynomial
  @param    in      the input polynomial
  @param    pows    the powers of the different variables
  @param    c       the coefficient
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  The array pows must have the size of the polynomial dimension and have
  non-negative elements.

  If the coefficient is already there, it is overwritten, if not, a new
  coefficient is added to the polynomial. This may cause the degree of the
  polynomial to be increased.

  Setting the coefficient of x1^4 * x3^2 in the 4 dimensional polynomial p to 
  12.3 would be performed by:

  cpl_polynomial_set_coeff(p, pows, 12.3);
  where pows is the integer array [4, 0, 2, 0].

  For efficiency reasons the coefficients of a 1d-polynomial are best inserted
  with the leading coefficient first.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if pows contains negative values
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_set_coeff(cpl_polynomial * in,
                                        const int      * pows,
                                        double           c)
{
    int          newnc;
    int          dim;
    int          ind  = -1; /* Assume coefficient is absent */

    
    /* Test entries */
    cpl_ensure_code(in,  CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(pows, CPL_ERROR_NULL_INPUT);
    for (dim=0 ; dim < in->dim ; dim++) 
        cpl_ensure_code(pows[dim] >= 0, CPL_ERROR_ILLEGAL_INPUT);

    /* Handle zero-valued coefficients as a special case */
    if (c == 0.0) return cpl_polynomial_delete_coeff(in, pows);

    /* Find the coeff */
    if (in->dim == 1) {
        if (pows[0] < in->nc) ind = pows[0];
    } else {    
        int i;
        for (i=0 ; i < in->nc ; i++) {
            if (!memcmp(in->pow + in->dim * i, pows, in->dim * sizeof(int)))
                break; /* Found the right combination of powers */
        }
        if (i < in->nc) ind = i;
    }

    /* The coefficient already exists : replace it and return */
    if (ind >= 0) {

        in->c[ind] = c;

        return CPL_ERROR_NONE;
    }


    /* The coefficient is a new one */
    /* Extend the arrays and update counter */

    /* New size. 1D-polynomials can grow with more than one coefficient */
    newnc = in->dim == 1 ? 1 + pows[0] : 1 + in->nc;

    in->c = in->nc ? cpl_realloc(in->c, newnc * sizeof(double))
                   : cpl_malloc(newnc * sizeof(double));

    if (in->dim == 1) {
        /* Initialize coefficients between existing and new one to zero */
        if (newnc - in->nc > 1)
            memset(in->c + in->nc, 0, (newnc - in->nc - 1) * sizeof(double));
    } else {
        /* Extend and copy the power arrays */

        in->pow = newnc == 1 ? cpl_malloc(in->dim * newnc * sizeof(int))
            : cpl_realloc(in->pow, in->dim * newnc * sizeof(int));

        memcpy(in->pow + in->dim * in->nc, pows, in->dim * sizeof(int));

        if (in->nc == 0) {

            in->max_degree = cpl_malloc(in->dim * sizeof(int));
            in->eval_pow   = cpl_malloc(in->dim * sizeof(double*));

            for (dim=0 ; dim < in->dim ; dim++) {
                in->max_degree[dim] = pows[dim];
                in->eval_pow[dim] = cpl_malloc((1+in->max_degree[dim]) * sizeof(double));
                in->eval_pow[dim][0] = 1.0; /* Always 1 */
            }

        } else {
            for (dim = 0; dim < in->dim; dim++) {
                if (pows[dim] <= in->max_degree[dim]) continue;
                in->max_degree[dim] = pows[dim];
                in->eval_pow[dim] =
                    cpl_realloc(in->eval_pow[dim],
                                (1+in->max_degree[dim]) * sizeof(double));

            }
        }
    }

    /* Set the new number of coefficients */
    in->nc = newnc;

    /* Set the new coefficient */
    in->c[in->nc-1] = c;

    return CPL_ERROR_NONE;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Evaluate the polynomial at the given point
  @param    p  The polynomial
  @param    x  Point of evaluation
  @return   The computed value or undefined on error.

  The length of x must be the polynomial dimension.

  A polynomial with no non-zero coefficents evaluates as 0.

  For 1-dimensional polynomials the call requires 2n FLOPs
  where n+1 is the number of coefficients in p, see also cpl_polynomial_eval_1d().
  For multivariate polynomials the call requires
  n*(1+dim) + d_1 + d_2 + ... + d_dim FLOPs,
  where dim is the dimenstion, n is the number of coefficients in p and d_i is
  the highest power used in dimension i, i = 1, 2, ..., dim.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INCOMPATIBLE_INPUT if the length of x differs from the dimension
    of the polynomial
 */
/*----------------------------------------------------------------------------*/
double cpl_polynomial_eval(const cpl_polynomial * p, const cpl_vector * x)
{


    /* Test entries */
    cpl_ensure(p, CPL_ERROR_NULL_INPUT, -1);
    cpl_ensure(x, CPL_ERROR_NULL_INPUT, -2);
    cpl_ensure(p->dim == cpl_vector_get_size(x), CPL_ERROR_INCOMPATIBLE_INPUT,
               -3);

    if (p->dim == 1) {
        return cpl_polynomial_eval_1d(p, cpl_vector_get(x, 0), NULL);
    } else if (p->nc == 0) {
        return 0;
    } else {
        const double * x_val = cpl_vector_get_data_const(x);
        double         z     = 0;
        double         val;
        int            i, j;

        /* Build the table of powers of the evaluation point */
        for (j=0 ; j < p->dim ; j++) {
            /* Allow changes to (the eval-buffer of) p!
               This is acceptable, because the data that is modified
               here is not accessed outside this else-block.
               FIXME: Revise this approach for shared memory parallelism.
            */

            double * eval_powj  = (double *) p->eval_pow[j];

            /* Already done: eval_powj[0] = 1.0; */
            for (i=1 ; i <= p->max_degree[j] ; i++) 
                eval_powj[i] = eval_powj[i-1] * x_val[j];

            cpl_tools_add_flops( p->max_degree[j]);

        }

        /* Evaluate the polynomial using the table.
           If the p->eval_pow does not fit in the cache,
           its memory layout should be changed */
        for (i=0 ; i<p->nc ; i++) {
            /* dim is at least 2 */
            val = p->eval_pow[0][p->pow[p->dim * i + 0]]
                * p->eval_pow[1][p->pow[p->dim * i + 1]];
            for (j=2 ; j < p->dim ; j++) {
                val *= p->eval_pow[j][p->pow[p->dim * i + j]];
            }
            z += p->c[i] * val;
        }
        cpl_tools_add_flops( p->nc * ( 1 + p->dim ) );
        return z;
    }
}


/*----------------------------------------------------------------------------*/
/**
  @brief   Collapse one dimension of a multi-variate polynomial by composition
  @param   self  The multi-variate polynomial
  @param   dim   The dimension to collapse (zero for first dimension)
  @param   other The polynomial to replace dimension dim of self
  @return  The collapsed polynomial or NULL on error

  The dimension of the polynomial self must be one greater than that of the
  other polynomial. Given these two polynomials the dimension dim of self is
  collapsed by creating a new polynomial from
    self(x0, x1, ..., x{dim-1},
         other(x0, x1, ..., x{dim-1}, x{dim+1}, x{dim+2}, ..., x{n-1}),
         x{dim+1}, x{dim+2}, ..., x{n-1}).   

  The created polynomial thus has a dimension which is one less than the
  polynomial self and which is equal to that of the other polynomial.
  Collapsing one dimension of a 1D-polynomial is equivalent to evaluating it,
  which can be done with cpl_polynomial_eval_1d().

  FIXME: The other polynomial must currently have a degree of zero, i.e. it must
  be a constant.

  Currently,
  the call requires dn + p FLOPs, where d the dimension of the polynomial self,
  p is the largest power of dimension dim and n the number of (non-zero)
  coefficients of the polynomial self.

  The returned object is a newly allocated cpl_polynomial that 
  must be deallocated by the caller using cpl_polynomial_delete().

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial is uni-variate.
  - CPL_ERROR_ILLEGAL_INPUT if dim is negative.
  - CPL_ERROR_ACCESS_OUT_OF_RANGE if dim exceeds the dimension of self.
  - CPL_ERROR_INCOMPATIBLE_INPUT if other has the wrong dimension.
  - CPL_ERROR_UNSUPPORTED_MODE if other is not of degree 0.

 */
/*----------------------------------------------------------------------------*/
cpl_polynomial * cpl_polynomial_extract(const cpl_polynomial * self, int dim,
                                        const cpl_polynomial * other)
{

    cpl_polynomial * collapsed;
    double         * eval_powj;
    int            * pows;
    double           x;
    int              newdim;
    int              i;


    cpl_ensure(self  != NULL,    CPL_ERROR_NULL_INPUT,          NULL);
    cpl_ensure(other != NULL,    CPL_ERROR_NULL_INPUT,          NULL);
    cpl_ensure(self->dim > 1,    CPL_ERROR_INVALID_TYPE,        NULL);
    cpl_ensure(dim >= 0,         CPL_ERROR_ILLEGAL_INPUT,       NULL);
    cpl_ensure(dim < self->dim,  CPL_ERROR_ACCESS_OUT_OF_RANGE, NULL);

    newdim = self->dim - 1;

    cpl_ensure(other->dim == newdim, CPL_ERROR_INCOMPATIBLE_INPUT, NULL);

    /* FIXME: Generalize this */
    cpl_ensure(cpl_polynomial_get_degree(other) == 0,
               CPL_ERROR_UNSUPPORTED_MODE, NULL);

    x = other->nc == 0 ? 0.0 : other->c[0];

    collapsed = cpl_polynomial_new(newdim);

    pows = (int*)cpl_malloc(newdim * sizeof(int));

    /* Build the table of powers of the evaluation point */
    /* Allow changes to (the eval-buffer of) self!
       This is acceptable, because the data that is modified
       here is not accessed outside this else-block.
       FIXME: Revise this approach for shared memory parallelism.
    */

    eval_powj  = (double *) self->eval_pow[dim];

    /* Already done: eval_powj[0] = 1.0; */
    for (i=1 ; i <= self->max_degree[dim] ; i++) 
        eval_powj[i] = eval_powj[i-1] * x;

    for (i=0 ; i < self->nc; i++) {
        double coeff = self->c[i];
        int j, k;

        for (k=0, j=0; j < self->dim; j++) {
            if (j == dim) {
                coeff *= eval_powj[self->pow[self->dim * i + j]];
            } else {
                pows[k++] = self->pow[self->dim * i + j];
            }
        }

        coeff += cpl_polynomial_get_coeff(collapsed, pows);

        (void)cpl_polynomial_set_coeff(collapsed, pows, coeff);

    }

    cpl_tools_add_flops( self->max_degree[dim] + self->nc * self->dim );

    cpl_free(pows);

    return collapsed;

}


/*----------------------------------------------------------------------------*/
/**
  @brief    Compute a first order partial derivative
  @param    self  The polynomial to be modified in place
  @param    dim   The dimension to differentiate (zero for first dimension)
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  The dimension of the polynomial is preserved, even if the operation may cause
  the polynomial to become independent of the dimension dim of the variable.

  The call requires n FLOPs, where n is the number of (non-zero) polynomial
  coefficients whose power in dimension dim is at least 1.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if dim is negative.
  - CPL_ERROR_ACCESS_OUT_OF_RANGE if dim exceeds the dimension of self.
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_derivative(cpl_polynomial * self, int dim)
{

    int              i;


    cpl_ensure_code(self != NULL,    CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(dim >= 0,        CPL_ERROR_ILLEGAL_INPUT);
    cpl_ensure_code(dim < self->dim, CPL_ERROR_ACCESS_OUT_OF_RANGE);

    if (self->nc == 0) return CPL_ERROR_NONE;

    if (self->dim == 1) {
        for (i=0 ; i < self->nc-1; i++) {
            self->c[i] = self->c[i+1] * (double) (i+1);
        }
        cpl_polynomial_delete_coeff(self, &i);
    } else {

        /* Remove the terms that are constant with respect to dim */
        for (i=0 ; i < self->nc; ) {
            if (self->pow[self->dim * i + dim] == 0) {
                /* FIXME: Memory aliasing occurs here - the contents of the
                   const pointer is actually modified */
                cpl_polynomial_delete_coeff(self, self->pow + self->dim * i);
            } else {
                i++;
            }
        }

        /* At this point self contains the correct number of terms
           which now need to be differentiated */

        for (i=0; i < self->nc; i++) {
            cx_assert(self->pow[self->dim * i + dim] > 0) ;
            self->c[i] *= (double) self->pow[self->dim * i + dim]--;
        }

        if (self->max_degree[dim] > 0) {
            /* Decrement the size of the work-array */

            self->eval_pow[dim] =
                cpl_realloc(self->eval_pow[dim],
                            self->max_degree[dim] * sizeof(double));

            self->max_degree[dim]--;

        }
    }

    cpl_tools_add_flops( self->nc );

    return CPL_ERROR_NONE;

}


/* A nested Horner algorithm can be used to evaluate both differences and
   derivatives.
   Caveat: All four arguments are evaluated more than once */
#define CPL_HORNER_NESTED(A,B,PA,PB)   \
 do {                                  \
        PB = PA;                       \
        while (n>1) {                  \
            PA = PA * A + p->c[--n];   \
            PB = PB * B + PA;          \
        }                              \
        PA = A * PA + p->c[0];         \
                                       \
  } while (0)

/*----------------------------------------------------------------------------*/
/**
  @brief    Evaluate a univariate polynomial using Horners rule.
  @param    p  The 1D-polynomial
  @param    x  The point of evaluation
  @param    pd Iff pd is not null, the derivative p`(x) evaluated at x
  @return   The result or undefined on error.

  A polynomial with no non-zero coefficents evaluates as 0 with a
  derivative that does likewise.

  The result is computed as p_0 + x * ( p_1 + x * ( p_2 + ... x * p_n ))
  and requires 2n FLOPs where n+1 is the number of coefficients in p.

  If the derivative is requested it is computed along with p(x),
  using a nested Horner rule. This requires 4n FLOPs in total.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial has the wrong dimension
 */
/*----------------------------------------------------------------------------*/
double cpl_polynomial_eval_1d(const cpl_polynomial * p, double x,
                              double * pd)
{

    cpl_long_double result;
    int      n;

    cpl_ensure(p != NULL,   CPL_ERROR_NULL_INPUT, -1);
    cpl_ensure(p->dim == 1, CPL_ERROR_INVALID_TYPE, -3);

    if (p->nc == 0) {
        if (pd != NULL) *pd = 0;
        return 0;
    }

    n = p->nc;

    result = p->c[--n];

    if (pd == NULL) {
        cpl_tools_add_flops( 2 * n );
        while (n) result = x * result + p->c[--n];
    } else {
        cpl_long_double d = 0;

        cpl_tools_add_flops( 4 * n );
        if (n) CPL_HORNER_NESTED(x, x, result, d);
        *pd = d;
    }

    return result;    
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Evaluate a 1D-polynomial on equidistant points using Horners rule
  @param    v  Preallocated vector to contain the result
  @param    p  The 1D-polynomial
  @param    x0 The first point of evaluation
  @param    d  The increment between points of evaluation
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_
  @see cpl_vector_fill

  The evaluation points are x_i = x0 + i * d, i=0, 1, ..., n-1,
  where n is the length of the vector.

  If d is zero it is preferable to simply use
  cpl_vector_fill(v, cpl_polynomial_eval_1d(p, x0, NULL)).

  The call requires about 2nm FLOPs, where m+1 is the number of coefficients in
  p.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial has the wrong dimension
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_vector_fill_polynomial(cpl_vector * v,
                                          const cpl_polynomial * p,
                                          double x0, double d)
{
    int i = cpl_vector_get_size(v);

    cpl_ensure_code(v != NULL,   CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p != NULL,   CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p->dim == 1, CPL_ERROR_INVALID_TYPE);

    /* cpl_vector_set() cannot fail in this context */
    do {
        i--;
        cpl_vector_set(v, i,
                       cpl_polynomial_eval_1d(p, x0 + (double)i * d, NULL));
    } while (i > 0);

    cpl_tools_add_flops( 2 * cpl_vector_get_size(v) );

    return CPL_ERROR_NONE;
}

/* Maximum number of Newton-Raphson iterations */
#ifndef CPL_NR_MAXITE
#define CPL_NR_MAXITE 100
#endif
/*----------------------------------------------------------------------------*/
/**
  @brief    A real solution to p(x) = 0 using Newton-Raphsons method
  @param    p    The 1D-polynomial
  @param    x0   First guess of the solution
  @param    px   The solution or undefined on error
  @param    mul  The root multiplicity (or 1 if unknown)
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  Even if a real solution exists, it may not be found if the first guess is
  too far from the solution. But a solution is guaranteed to be found if all
  roots of p are real. If the constant coefficient is zero, the solution 0
  will returned regardless of the first guess.

  No solution is found and *px is undefined when the iterative process stops
  because:
  1) It can not proceed because p`(x) = 0 (CPL_ERROR_DIVISION_BY_ZERO).
  2) Only a finite number of iterations are allowed. (CPL_ERROR_CONTINUE).
  Both cases may be due to lack of a real solution or a bad first guess.

  The accuracy and robustness deteriorates with increasing multiplicity of the
  solution. This is also the case with numerical multiplicity, i.e. when
  multiple solutions are located close together.

  mul is assumed to be the multiplicity of the solution. Knowledge of the root
  multiplicity often improves the robustnes and accuracy. If there is no
  knowledge of the root multiplicity mul should be 1.
  Setting mul to a too high value should be avoided.

  Reverse order of the coefficients:
  Given x such that p(x) = 0 (p having non-zero constant and leading
  coefficient) then q(1/x) = 0, where q is obtained by reversing the
  order of the coefficients of p.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial has the wrong dimension
  - CPL_ERROR_ILLEGAL_INPUT if the multiplicity is non-positive
  - CPL_ERROR_DIVISION_BY_ZERO if a division by zero occurs
  - CPL_ERROR_CONTINUE if the algorithm does not converge
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_solve_1d(const cpl_polynomial * p,
                                       double x0, double * px, int mul)
{

    /* Initialize to ensure at least one iteration */
    double r = 1;
    double d = 0;
    double xprev = 2 * x0 + 1;
    double rprev;
    double dprev;
    const double mm = mul; /* Root multiplicity */
    int    mite;
    int    i;


    cpl_ensure_code(px != NULL,  CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p  != NULL,  CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p->dim == 1, CPL_ERROR_INVALID_TYPE);
    cpl_ensure_code(mul > 0,     CPL_ERROR_ILLEGAL_INPUT);

    /* Iterating towards zero is not as simple as it sounds, so don't */
    if (p->nc == 0 || p->c[0] == 0.0) {
       *px = 0;
       return CPL_ERROR_NONE;
    }

    *px = x0;

    mite = p->nc * CPL_NR_MAXITE;

    for (i = 0; i < mite; i++, xprev = *px) {
     
        rprev = r;
        dprev = d;
        /* Compute residual, r = p(x) and derivative, d = p`(x) */
        r = cpl_polynomial_eval_1d(p, *px, &d);

        /* Stop if:
           1) Correction did not decrease - unless p`(x) changed sign
           2) p`(x) == 0. It is insufficient to implement this as d == 0,
              because some non-zero divisors can still result in inf. */
        if (d * dprev >= 0 && fabs(r * dprev) >= fabs(rprev * d)) break;

        /* Compute and apply the accelerated Newton-Raphson correction */
        *px -= mm * r / d;

        /* *px can become NaN - in which case the iteration goes on
           until the maximum number of iterations is reached.
           In one case this happens because a p'(x) == 0 is pertubed by
           round-off. */

        /* Stop also if:
           3) The correction did not change the solution
              - will typically save at most one Horner-evaluation  */
        if (fabs(*px-xprev) < fabs(*px) * DBL_EPSILON) break;

        /* if x_i == x_j for i > j > 0 the iteration cannot converge.
           This can only happen with at least two sign-changes for p`(x_k)
           i >= k >= j. This is not checked for. */

    }

    cpl_tools_add_flops( i * 12 );

    cpl_ensure_code(i < mite, CPL_ERROR_CONTINUE);

    /* At this point:
       In absence of rounding r or d is zero.
       Due to rounding r or d is zero or close to zero.
       If there is no solution only d is (close to) zero.
       If there is a single solution only r is (close to) zero.
       If there is a multiple solution both r and d are (close to) zero
       - in this case |r| cannot be bigger than |d| because p is one
       degree higher than p'. */

    if (fabs(r) > fabs(d)) {
        /* When d is computed in long double precision at a multiple root,
           |r| _can_ be bigger than |d|.

           Quick fix: Assume that solution is still OK, if |r| is small
           compared to the largest coefficient */

       /* Since r is non-zero, at least one coefficient must be non-zero */
       int n = p->nc;
       double max = 0;
       while (n--) if (fabs(p->c[n]) > max) max = fabs(p->c[n]);

       cpl_ensure_code(fabs(r) <= max * DBL_EPSILON,
                           CPL_ERROR_DIVISION_BY_ZERO);

    }

    return CPL_ERROR_NONE;
    
}
#undef CPL_NR_MAXITE

/*----------------------------------------------------------------------------*/
/**
  @brief    Given p and u, modify the polynomial to p(x) := p(x+u)
  @param    p  The 1D-polynomial to be modified in place
  @param    u  The shift
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_

  The call requires n*(n+1) FLOPs, where n is the degree of p.

  The transformation p(x) := (x+1)^n will generate the binomial coefficients,
  p_i =  p_{n-i} = ( n ), i =0, 1, ..., n.
                   ( i )

  Transformation with u = -p_{n-1}/n/p_n will (in absence of rounding errors)
  yield a polynomial with p_{n-1} = 0 and roots that have the sum zero.
  No transformation can modify the leading coefficient.  

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial has the wrong dimension
 */
/*----------------------------------------------------------------------------*/
cpl_error_code cpl_polynomial_shift_1d(cpl_polynomial * p, double u)
{


    cpl_ensure_code(p != NULL,   CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(p->dim == 1, CPL_ERROR_INVALID_TYPE);

    if (p->nc) cpl_polynomial_shift_double( p->c, p->nc, u);

    return CPL_ERROR_NONE;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Fit a 1D-polynomial to a 1D-signal in a least squares sense
  @param    x_pos   Vector of positions of the signal to fit.
  @param    values  Vector of values of the signal to fit.
  @param    degree  Non-negative polynomial degree.
  @param    mse     Iff mse is not null, the mean squared error on success
  @return   The fitted polynomial or NULL on error
  @note In the absence of rounding-errors the fit will succeed if more than
        degree distinct data-points are provided, but in reality this may not
        hold. Secondly, an increase in the polynomial degree will normally
        reduce the mean squared error. However, due to rounding errors and
        the limited accuracy of the solver of the normal equations, an increase
        in the polynomial degree may at some point cause the mse to _increase_.
        In some cases this happens with an increase of the polynomial degree
        from 9 to 10.
 
  This function fits a 1D-polynomial to a 1D-signal in a least squares sense.

  The input signal is given in x_pos and values which must be of the same size.

  x_pos must contain more than degree distinct values.

  mse may be NULL. If it is not, *mse is set to the mean squared error on
  success, while it is unchanged on error.

  The fit is done in the following steps:
  1) x_pos is first transformed into xhat = x_pos - mean(x_pos).
  2) The normal equations of the Vandermonde matrix are formed from xhat.
  3) The normal equations are solved using Cholesky factorization.
  4) The resulting polynomial in xhat is transformed back to x_pos.

  The call requires 6MN + N^3/3 + 7/2N^2 + O(M) FLOPs where M is the number
  of data points and where N is the number of polynomial coefficients,
  N = degree + 1.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if degree is negative.
  - CPL_ERROR_DATA_NOT_FOUND if the size of x_pos is less than the number of
    coefficients to be determined, degree + 1
  - CPL_ERROR_INCOMPATIBLE_INPUT if x_pos and values have different sizes
  - CPL_ERROR_SINGULAR_MATRIX if x_pos contains too few distinct values.
 */
/*----------------------------------------------------------------------------*/
cpl_polynomial * cpl_polynomial_fit_1d_create(
        const cpl_vector    *   x_pos,
        const cpl_vector    *   values,
        int                     degree,
        double              *   mse)
{
    /* Number of unknowns to determine */
    const int           nc = 1 + degree;
    const int           np = cpl_vector_get_size(x_pos);
    double              mean;
    double              delta, xval;
    cpl_vector      *   xhat;
    cpl_matrix      *   mh;   /* Hankel matrix */
    cpl_matrix      *   mx;
    cpl_polynomial  *   fit;
    int                 i, j;
    cpl_boolean         is_eqdist;
    cpl_error_code      error;
    
    /* Check entries */
    cpl_ensure(np > 0,      cpl_error_get_code(), NULL);
    cpl_ensure(values,      CPL_ERROR_NULL_INPUT, NULL);

    cpl_ensure(cpl_vector_get_size(values) == np, CPL_ERROR_INCOMPATIBLE_INPUT,
               NULL);

    if (degree == 0) {
        /* Handle this as a special case */
        mean = cpl_vector_get_mean(values);
        fit = cpl_polynomial_new(1);
        cpl_polynomial_set_coeff(fit, &degree, mean);
        /* If requested, compute mean squared error */
        if (mse != NULL) *mse =
                cpl_tools_get_variance_double(cpl_vector_get_data_const(values),
                                              np, mean);
        return fit;
    }
    cpl_ensure(degree > 0, CPL_ERROR_ILLEGAL_INPUT,  NULL);
    cpl_ensure(np >= nc,   CPL_ERROR_DATA_NOT_FOUND, NULL);

    /* The Hankel matrix may be singular in such a fashion, that the pivot
       points in its Cholesky decomposition are positive due to rounding errors.
       To ensure that such singular systems are robustly detected, the number of
       distinct sampling points is counted.
    */

    cpl_ensure(!cpl_vector_ensure_distinct(x_pos, nc),
               CPL_ERROR_SINGULAR_MATRIX, NULL);

    /* Try to detect if the x-points are equidistant
       - in which every other skew diagonal of the Hankel matrix is zero */
    xval = cpl_vector_get(x_pos, 1);
    delta = xval - cpl_vector_get(x_pos, 0);
    for (i=1; i < np-1; i++) {
        const double dprev = delta;
        const double xprev = xval;

        xval = cpl_vector_get(x_pos, i+1);
        delta = xval - xprev;
        if (delta != dprev) break;
    }
    is_eqdist = i == np-1;

    /* Transform: xhat = x - mean(x) */
    xhat = cpl_vector_transform_mean(x_pos, &mean);

    cx_assert( xhat != NULL );

    /* Ensure that the center element of xhat is zero */
    /* NB:  This can only be done, when the sampling points are ordered! */
    if (is_eqdist && (np & 1)) cpl_vector_set(xhat, np>>1, 0.0);

    /* Generate Hankel matrix, H = V' * V, where V is the Vandermonde matrix */
    /* FIXME: It is faster and likely more accurate to compute the QR
       factorization of the Vandermonde matrix, QR = V, see
       C.J. Demeure: Fast QR Factorization of Vandermonde Matrices */

    /* mh is initlized only if xhat is not equidistant */
    mh = is_eqdist ? cpl_matrix_new(nc, nc) :
         cpl_matrix_wrap(nc, nc, cpl_malloc(nc * nc * sizeof(double)));
    mx = cpl_matrix_wrap(nc,  1, cpl_malloc(nc *  1 * sizeof(double)));

    cpl_matrix_fill_normal_vandermonde(mh, mx, xhat, is_eqdist, values);

    cpl_vector_delete(xhat);

    error = cpl_matrix_solve_spd(mh, mx);

    cpl_matrix_delete(mh);

    if (error) {
        cpl_matrix_delete(mx);
        cpl_ensure(0, error, NULL);
    }


    /* Scale back - and store coefficients - with leading coefficient first,
       see doxygen of cpl_polynomial_set_coeff() */
    fit = cpl_polynomial_new(1);
    for (j = nc-1; j >= 0; j--) {
        const double coeff = cpl_matrix_get(mx, j, 0);
        
        cpl_polynomial_set_coeff(fit, &j, coeff);

    }

    cpl_matrix_delete(mx);

    /* Shift back */
    cpl_polynomial_shift_1d(fit, -mean);

    /* If requested, compute mean squared error */
    if (mse != NULL) {
        *mse = 0;
        for (i=0; i < np; i++) {
            /* Subtract from the true value, square, accumulate */
            const double residue = cpl_vector_get(values, i)
                - cpl_polynomial_eval_1d(fit, cpl_vector_get(x_pos, i), NULL);
            *mse += residue * residue;
        }
        /* Average the error term */
        *mse /= np;

        cpl_tools_add_flops( 3 * np + 1 );
    }

    return fit;
}

/*----------------------------------------------------------------------------*/
/**
  @brief    Fit a 2D-polynomial to a 2D-surface in a least squares sense
  @param    xy_pos  Bivector  positions of the surface to fit.
  @param    values  Vector of values of the surface to fit.
  @param    degree  Non-negative polynomial degree.
  @param    mse     Iff mse is not null, the mean squared error on success
  @return   The fitted polynomial or NULL on error
  @see cpl_polynomial_fit_1d_create
  @see cpl_polynomial_get_degree
 
  This function fits a 2D-polynomial to a 2D-surface in a least squares sense.

  The input signal is given in xy_pos and values which must be of the same size.

  The size of xy_pos must be at least (degree+1)*(degree+2)/2, which is the
  number of polynomial coefficients to be determined - and xy_pos must contain
  at least that many distinct values.

  mse may be NULL. If it is not, *mse is set to the mean squared error on
  success, while it is unchanged on error.

  Example: For degree=3, the following terms will be computed:
 
  1     x       x^2      x^3
  y     x.y     x^2.y
  y^2   x.y^2
  y^3


  The fit is done in the following steps:
  1) The x-positions are first transformed into
      xhat = x - mean(x), and
     the y-positions are first transformed into
      yhat = y - mean(y).
  2) The Vandermonde matrix is formed from xhat and yhat.
  3) The normal equations of the Vandermonde matrix is solved.
  4) The resulting polynomial in (xhat, yhat) is transformed to back (x,y).

  Warning: An increase in the polynomial degree will normally reduce the
  mean squared error. However, due to rounding errors and the limited accuracy
  of the solver of the normal equations, an increase in the polynomial degree
  may at some point cause the mse to _increase_. In some cases this happens
  with an increase of the polynomial degree from 8 to 9. 

  The call requires MN^2 + N^3/3 + O(MN) FLOPs where M is the number of
  data points and where N is the number of polynomial coefficients,
  N = (degree + 1)*(degree + 2)/2.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if degree is negative.
  - CPL_ERROR_DATA_NOT_FOUND if the size of xy_pos is less than the number of
    coefficients to be determined, N.
  - CPL_ERROR_INCOMPATIBLE_INPUT if x_pos and values have different sizes
  - CPL_ERROR_SINGULAR_MATRIX if xy_pos contains too few distinct values
 */
/*----------------------------------------------------------------------------*/
cpl_polynomial * cpl_polynomial_fit_2d_create(
        cpl_bivector     *  xy_pos,
        cpl_vector       *  values,
        int                 degree,
        double           *  mse)
{
    const int        np = cpl_bivector_get_size(xy_pos);
    /* Number of unknowns to determine in one dimension */
    const int        nc1 = 1+degree;
    /* Number of unknowns to determine */
    const int        nc = nc1 * (1 + nc1) / 2;
    cpl_matrix     * mv;   /* The transpose of the Vandermonde matrix */
    cpl_matrix     * mh;   /* Block-Hankel matrix, V'*V */
    cpl_matrix     * mb;
    cpl_matrix     * mx;
    cpl_polynomial * fit;
    double         * coeffs1d;
    cpl_vector     * xhat;
    cpl_vector     * yhat;
    double           xmean;
    double           ymean;
    int              powers[2];
    int              degx, degy;
    int              i, j;
    cpl_error_code      error;
   

    /* Check entries */
    cpl_ensure(np > 0,      cpl_error_get_code(), NULL);
    cpl_ensure(values,      CPL_ERROR_NULL_INPUT, NULL);

    cpl_ensure(cpl_vector_get_size(values) == np, CPL_ERROR_INCOMPATIBLE_INPUT,
               NULL);

    if (degree == 0) {
        /* Handle this as a special case */
        powers[0] = powers[1] = 0;
        /* Copy-paste from 1D 0-degree */
        xmean = cpl_vector_get_mean(values);
        fit = cpl_polynomial_new(2);
        cpl_polynomial_set_coeff(fit, powers, xmean);
        /* If requested, compute mean squared error */
        if (mse != NULL)
            *mse = cpl_tools_get_variance_double(cpl_vector_get_data(values),
                                                 np, xmean);
        return fit;
    }

    cpl_ensure(degree > 0, CPL_ERROR_ILLEGAL_INPUT,  NULL);
    cpl_ensure(np >= nc,   CPL_ERROR_DATA_NOT_FOUND, NULL);

    /* Transform: xhat = x - mean(x) */
    xhat = cpl_vector_transform_mean(cpl_bivector_get_x(xy_pos), &xmean);
    cx_assert( xhat != NULL );

    /* Transform: yhat = y - mean(y) */
    yhat = cpl_vector_transform_mean(cpl_bivector_get_y(xy_pos), &ymean);
    cx_assert( yhat != NULL );

    /* Initialize matrices */
    /* mv contains the polynomial terms in the order described */
    /* above in each row, for each input point. */
    mv = cpl_matrix_wrap(nc, np, (double*)cpl_malloc(nc*np*sizeof(double)));

    /* Has redundant FLOPs, appears to improve accuracy */
    for (i=0 ; i < np ; i++) {
        const double x = cpl_vector_get(xhat, i);
        const double y = cpl_vector_get(yhat, i);
        double xvalue;
        double yvalue = 1;
        j = 0;
        for (degy = 0; degy <= degree; degy++) {
            xvalue = 1;
            for (degx = 0; degx <= degree-degy; degx++, j++) {
                cpl_matrix_set(mv, j, i, xvalue * yvalue);
                xvalue *= x;
            }
            yvalue *= y;
        }
        cx_assert( j == nc );
    }
    cpl_tools_add_flops( np * (nc * 2 + nc1));

    cpl_vector_delete(xhat);
    cpl_vector_delete(yhat);

    /* mb contains the values */
    mb = cpl_matrix_wrap(np, 1, cpl_vector_get_data(values));

    /* Form the right hand side of the normal equations */
    mx = cpl_matrix_product_create(mv, mb);

    cpl_matrix_unwrap(mb);

    /* Form the matrix of the normal equations */
    mh = cpl_matrix_product_normal_create(mv);
    cpl_matrix_delete(mv);

    /* Solve XA=B by a least-square solution (aka pseudo-inverse). */
    error = cpl_matrix_solve_spd(mh, mx);

    cpl_matrix_delete(mh);

    if (error) {
        cpl_matrix_delete(mx);
        cpl_ensure(0, error, NULL);
    }

    /* Store coefficients for output */
    fit = cpl_polynomial_new(2);

    coeffs1d = cpl_malloc((degree+1)*sizeof(double));
    for (degx = 0; degx <= degree; degx++) {
        j = degx;
        i = 0;
        /* Copy the polynomial in y into a contiguous buffer */
        for (degy = 0; degy <= degree-degx; degy++, i++) {
            coeffs1d[i] = cpl_matrix_get(mx, j, 0);
            j += degree+1-degy;
        }
        /* Transform the polynomial in y back */
        cpl_polynomial_shift_double(coeffs1d, degree-degx+1, -ymean);
        j = degx;
        i = 0;
        /* Copy the buffer contents back into the polynomial */
        for (degy = 0; degy <= degree-degx; degy++, i++) {
            cpl_matrix_set(mx, j, 0, coeffs1d[i]);
            j += degree+1-degy;
        }
    }
    
    cpl_free(coeffs1d);
    coeffs1d = cpl_matrix_get_data(mx);

    j = 0;
    for (degy = 0; degy <= degree; degy++) {
        powers[1] = degy;
        /* Transform the polynomial in x back */
        cpl_polynomial_shift_double(coeffs1d+j, degree-degy+1, -xmean);
        for (degx = 0; degx <= degree-degy; degx++, j++) {
            powers[0] = degx;
            cx_assert( coeffs1d[j] == cpl_matrix_get(mx, j, 0) );
            cpl_polynomial_set_coeff(fit, powers, cpl_matrix_get(mx, j, 0));
        }
    }
    cx_assert( j == nc );

    cpl_matrix_delete(mx);

    /* If requested, compute mean squared error */
    if (mse != NULL) {
        const cpl_vector * x_pos = cpl_bivector_get_x(xy_pos);
        const cpl_vector * y_pos = cpl_bivector_get_y(xy_pos);
        cpl_vector * x_val = cpl_vector_new(2);
        double residue;

        *mse = 0;
        for (i=0 ; i<np ; i++) {
            cpl_vector_set(x_val, 0, cpl_vector_get(x_pos, i));
            cpl_vector_set(x_val, 1, cpl_vector_get(y_pos, i));
            /* Subtract from the true value, square, accumulate */
            residue = cpl_vector_get(values, i)-cpl_polynomial_eval(fit, x_val);
            *mse += residue * residue;
        }
        cpl_vector_delete(x_val);
        /* Average the error term */
        *mse /= np;
        cpl_tools_add_flops( 3 * np + 1 );
    }

    return fit;
}


#ifdef CPL_POLYNOMIAL_USE_DIFF_1D
/*----------------------------------------------------------------------------*/
/**
  @brief    Evaluate p(a) - p(b) using Horners rule.
  @param    p   The 1D-polynomial
  @param    a   The evaluation point of the minuend
  @param    b   The evaluation point of the subtrahend
  @param    ppa Iff ppa is not null, p(a)
  @return   The difference or undefined on error

  The call requires about 4n FLOPs where n is the number of coefficients in p,
  which is the same as that required for two separate polynomial evaluations.
  cpl_polynomial_diff_1d() is however more accurate.

  ppa may be NULL. If it is not, *ppa is set to p(a), which is calculated at
  no extra cost.

  The underlying algorithm is the same as that used in cpl_polynomial_eval_1d()
  when the derivative is also requested.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_INVALID_TYPE if the polynomial has the wrong dimension
 */
/*----------------------------------------------------------------------------*/
double cpl_polynomial_diff_1d(const cpl_polynomial * p, double a,
                              double b, double * ppa)
{


    cpl_long_double diff, pa;
    int      n;


    cpl_ensure(p != NULL,   CPL_ERROR_NULL_INPUT, -1);
    cpl_ensure(p->dim == 1, CPL_ERROR_INVALID_TYPE, -3);

    if (p->nc == 0) {
        if (ppa != NULL) *ppa = 0;
        return 0;
    }

    n = p->nc;

    pa = p->c[--n];

    cpl_tools_add_flops( 4 * n );
    CPL_HORNER_NESTED(a, b, pa, diff);

    if (ppa != NULL) *ppa = pa;

    return diff * (a - b);

}
#endif
#undef CPL_HORNER_NESTED

/**@}*/

/*----------------------------------------------------------------------------*/
/**
  @brief    Delete (set to zero) a coefficient of a polynomial
  @param    in      The polynomial to modify
  @param    pows    The power(s) of the different variables
  @return   CPL_ERROR_NONE or the relevant #_cpl_error_code_
  @see cpl_polynomial_set_coeff()
  @note Passing a row from self->pow as pows causes memory aliasing. This is
        allowed, but pows[] will be modified (or even unallocated) by the call.

  Possible #_cpl_error_code_ set in this function:
  - CPL_ERROR_NULL_INPUT if an input pointer is NULL
  - CPL_ERROR_ILLEGAL_INPUT if pows contains negative values
 */
/*----------------------------------------------------------------------------*/
static cpl_error_code cpl_polynomial_delete_coeff(cpl_polynomial * self,
                                                  const int * pows)
{

    int i, dim;
    int ind;


    cpl_ensure_code(self != NULL, CPL_ERROR_NULL_INPUT);
    cpl_ensure_code(pows != NULL, CPL_ERROR_NULL_INPUT);
    for (dim=0 ; dim < self->dim ; dim++) 
        cpl_ensure_code(pows[dim] >= 0, CPL_ERROR_ILLEGAL_INPUT);

    if (self->dim == 1) {
        /* Handle 1D as a special case */
        if (pows[0] == self->nc-1) {
            /* Leading 1D-coefficient is zero - find the new leading */
            do {
                self->nc--;
            } while (self->nc > 0 && self->c[self->nc - 1] == 0.0);

            if (self->nc > 0)
                self->c = cpl_realloc(self->c, self->nc * sizeof(double));
            else
                cpl_free(self->c);

        } else if (pows[0] < self->nc-1) {
            self->c[pows[0]] = 0.0;
        }
        return CPL_ERROR_NONE;
    }

    /* Find the coeff of the multi-variate polynomial */
    for (i=0 ; i < self->nc ; i++) {
        if (!memcmp(self->pow + self->dim * i, pows, self->dim * sizeof(int)))
            break; /* Found the right combination of powers */
    }
    if (i == self->nc) return CPL_ERROR_NONE;

    /* The coefficient exists: Set it to zero and return */
    ind = i;

    /* Shrink polynomium */
    self->nc--;

    /* Shrink array of powers */
    if (self->nc > 0) {

        /* self->nc now points to the last coefficient in the polynomial */

        /* Reduce (if possible) the length of the power-table */
        for (dim = 0; dim < self->dim; dim++) {
            int new_max;

            if (self->pow[self->dim * ind + dim] < self->max_degree[dim])
                continue;

            /* The max-power of this dim may decrease */

            new_max = 0;        
            for (i=0; i < self->nc && new_max < self->max_degree[dim]; i++) {
                if (self->pow[self->dim * i + dim] > new_max)
                    new_max = self->pow[self->dim * i + dim];
            }

            if (new_max == self->max_degree[dim]) continue;

            /* The max-power of this dim decreases */
            self->max_degree[dim] = new_max;
            self->eval_pow[dim] =
                cpl_realloc(self->eval_pow[dim],
                            (1+self->max_degree[dim]) * sizeof(double));
        }

        if (ind < self->nc) {
            /* Move last coefficient to place of zero-valued one */
            self->c[ind] = self->c[self->nc];

            /* Copy last powers to place of those of the zero-valued one */
            memcpy(self->pow + self->dim * ind, self->pow + self->dim * self->nc,
                   self->dim * sizeof(int));

        }

        self->c = cpl_realloc(self->c, self->nc * sizeof(double));
        self->pow = cpl_realloc(self->pow,
                                self->dim * self->nc * sizeof(int));

    } else {
        cpl_free(self->c);
        cpl_free(self->pow);
        cpl_free(self->max_degree);
        for (dim=0 ; dim < self->dim ; dim++) {
            cpl_free(self->eval_pow[dim]);
        }
        cpl_free(self->eval_pow);
    }

    return CPL_ERROR_NONE;

}

/*----------------------------------------------------------------------------*/
/**
  @brief    Fill the Hankel Matrix H=V'*V, where V is a 1D-Vandermonde matrix
  @param    self      The matrix H
  @param    mx        A right multiplication with V', mx = V' * values
  @param    xhat      The mean-transformed x-values
  @param    is_eqdist True iff xhat contains equidistant points
  @param    values    The values to be interpolated
  @return   void

  self must have its elements initialized to zero iff is_eqdist is true.

 */
/*----------------------------------------------------------------------------*/
static void cpl_matrix_fill_normal_vandermonde(cpl_matrix * self,
                                               cpl_matrix * mx,
                                               const cpl_vector * xhat,
                                               cpl_boolean is_eqdist,
                                               const cpl_vector * values)
{


    cpl_vector * phat = cpl_vector_duplicate(xhat); /* Powers of xhat */
    double     * dhat = cpl_vector_get_data(phat);
    const int    nc   = cpl_matrix_get_ncol(self);
    const int    np   = cpl_vector_get_size(xhat);
    int          i,j;


    cx_assert( nc == cpl_matrix_get_nrow(self) );
    cx_assert( nc == cpl_matrix_get_nrow(mx) );
    cx_assert( 1  == cpl_matrix_get_ncol(mx) );
    cx_assert( np == cpl_vector_get_size(values) );

    /* Fill Hankel matrix from top-left to main skew diagonal
       - on and above (non-skew) main diagonal */
    /* Also compute transpose(V) * b */
    /* Peel off 1st iteration */
    cpl_matrix_set(self, 0, 0, (double)np);
    for (j=1; j < 2; j++) {
        double vsum0 = 0.0;
        double hsum = 0.0;
        double vsum = 0.0;
        int k;
        for (k=0; k < np; k++) {
            const double x = cpl_vector_get(xhat, k);
            const double y = cpl_vector_get(values, k);

            hsum += x;
            vsum += y * x;
            vsum0 += y;
        }
        cpl_matrix_set(mx, 0, 0, vsum0);
        cpl_matrix_set(mx, j, 0, vsum);
        if (is_eqdist) continue;
        k = j;
        for (i=0; i <= k; i++, k--) {
            cpl_matrix_set(self, i, k, hsum);
        }
    }
    for (; j < nc; j++) {
        int k;
        double hsum = 0.0;
        double vsum = 0.0;
        for (k=0; k < np; k++) {
            const double x = cpl_vector_get(xhat, k);
            const double y = cpl_vector_get(values, k);

            dhat[k] *= x;
            hsum += dhat[k];
            vsum += y * dhat[k];
        }
        cpl_matrix_set(mx, j, 0, vsum);
        if (is_eqdist && (j&1)) continue;
        k = j;
        for (i=0; i <= k; i++, k--) {
            cpl_matrix_set(self, i, k, hsum);
        }
    }
    /* Fill remaining Hankel matrix - on and above (non-skew) main diagonal */
    for (i = 1; i < nc; i++) {
        int k;
        double hsum = 0.0;

        if (is_eqdist && ((i+nc)&1)==0) {
            cpl_vector_multiply(phat, xhat);
            continue;
        }

        for (k=0; k < np; k++) {
            const double x = cpl_vector_get(xhat, k);

            dhat[k] *= x;
            hsum += dhat[k];
        }
        k = i;
        for (j = nc-1; k <= j; k++, j--) {
            cpl_matrix_set(self, k, j, hsum);
        }
    }
    
    cpl_tools_add_flops( 6 * np * ( nc - 1) );

    cpl_vector_delete(phat);

}