/* polyserial.c
 *
 * Copyright (C) 2013 Stephane Germain
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

/**
   \file
   \brief Polyserial correlation.
   \author Stephane Germain <germste@gmail.com>
*/

#include "libirt.h"
#if HAVE_CONFIG_H
#  include <config.h>
#endif
#include <math.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_randist.h>

/**
   \brief Compute the polyserial correlation factor to be multiplied
   to the point-biserial (pearson) correlation.

   @param[in] patterns A vector(subjects) with the responses.
   @param[in] nbr_option The number of option..
   @param[in] offset The value of the forst option.
   @param[in] options_weights The weights of each option.

   If there is no weights or the weights are not strictly increasing
   or there is only two options, then the biserial correlation factor
   is calculated (i.e. the response are corrected with the option
   having the biggest weight.

   \return The polyserial correlation factor.
*/
double 
polyserial_factor(gsl_vector_int * patterns, int nbr_option, int offset, gsl_vector * options_weights)
{
  int resp, j, k, nbr_subject = patterns->size, nbr_complete = 0, cumsum = 0, count, force_bis = 0, good_opt;
  gsl_vector_int * opt_count = gsl_vector_int_alloc(nbr_option);
  double rho = 0, mean = 0, sd = 0, prop, qnorm, dnorm, w_cur, w_prev, w_max;

  /* check if the weight are striclty increasing */
  if (options_weights != NULL) {
    w_prev = gsl_vector_get(options_weights, 0);
    w_max = w_prev;
    good_opt = 0;
    for (k = 1; k < nbr_option; k++)
      {
	w_cur = gsl_vector_get(options_weights, k);
	if (w_cur <= w_prev) force_bis = 1;
	w_prev = w_cur;
	if (w_cur >= w_max) {
	  w_max = w_cur;
	  good_opt = k;
	}
      }
  }
  if (force_bis) {
    good_opt += offset;
    nbr_option = 2;
  }

  /* compute the counts of each options */
  gsl_vector_int_set_all(opt_count, 0);
  for (j = 0; j < nbr_subject; j++)
    {
      resp = gsl_vector_int_get(patterns, j);
      if (resp != BLANK) {
	nbr_complete++;
	if (force_bis) resp = (resp == good_opt);
	else resp -= offset;
	count = gsl_vector_int_get(opt_count, resp);
	count++;
	gsl_vector_int_set(opt_count, resp, count);
      }
    }

  /* compute the mean and accumulate the pdf at the cuts */
  for (k = 0; k < nbr_option; k++)
    {
      count = gsl_vector_int_get(opt_count, k);
      mean += count * (k + offset);
      cumsum += count;
      if (k < nbr_option - 1) {
	prop = (VERY_SMALL_PROB+cumsum) / (VERY_SMALL_FREQ+nbr_complete);
	qnorm = gsl_cdf_ugaussian_Pinv(prop);
	dnorm = gsl_ran_ugaussian_pdf(qnorm);
	rho += dnorm;
      }
    }
  if (nbr_complete > 0) mean /= nbr_complete;

  /* compute the sd */
  for (k = 0; k < nbr_option; k++)
    {
      count = gsl_vector_int_get(opt_count, k);
      sd += count * (k + offset - mean) * (k + offset - mean);
    }
  if (nbr_complete > 1) sd = sqrt(sd/(nbr_complete - 1.0));

  if (nbr_complete > 1 && rho > 0) {
    rho = sqrt((nbr_complete - 1.0) / nbr_complete) * sd / rho;
  } else {
    rho = 1;
  }

  gsl_vector_int_free(opt_count);

  return rho;
}

