;+
; NAME:
;   ks_onesample
;
; PURPOSE:
;   This function performs the one-sample Kolmogorov-Smirnov test.
;
; CATEGORY:
;   Statistics
;
; CALLING SEQUENCE:
;   result = ks_onesample( sample_data )
;
; INPUT:
;   SAMPLE_DATA:  A required float vector of length N_DATA containing the
;     sample data.
;
; KEYWORD PARAMETERS:
;   METHOD:  A required scalar string specifying the method.  Supported
;     values are:
;     * 'monte carlo':  The statistics are calculated using a brute-force
;       Monte Carlo method, resampling the reference distribution.
;     * 'standard':  The (truncated) Kolmogorov formula is solved directly,
;       without any shortcuts.
;   REF_DATA:  An optional float vector of length N_DATA containing the
;     cumulative distribution function of the reference distribution at the
;     locations sampled in SAMPLE_DATA.  If not input then REF_NAME must be
;     input.  This does not work with the Monte Carlo method.
;   REF_NAME:  An optional scalar string naming the reference distribution.
;     If not input then REF_DATA must be.  Supported values are:
;     * 'uniform':  The uniform distribution over the [0,1] range.
;     * 'gaussian':  The unit Gaussian distribution.
;   SEED:  An optional scalar integer specifying the seed for the Monte
;     Carlo random number generator.
;   WEIGHTS:  An optional float vector of length N_DATA containing weights
;     for each of the N_DATA values in SAMPLE_DATA.  This only works with
;     the Monte Carlo method.  The default is identical weights.
;
; OUTPUT:
;   RESULT:  A float scalar returning the significance level from the
;     Kolmogorov-Smirnov test.
;
; PROCEDURE:
;   This function compares the distribution of the data in SAMPLE_DATA with
;   the reference distribution specified using REF_DATA or REF_NAME.  The
;   calculation follows the method described at
;   https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test.
;
; EXAMPLE:
;   ; Evaluate whether a data vector generated with the random uniform
;   ; distribution generator is in fact uniformly distributed.
;   data = randomu( seed, 100 )
;   print, ks_onesample( data, method='standard', ref_name='uniform' )
;
; MODIFICATION HISTORY:
;   Written by:  Daithi A. Stone (dastone@runbox.com), 2020-11-03
;-

;***********************************************************************

FUNCTION KS_ONESAMPLE, $
    SAMPLE_DATA, $
    METHOD=method, $
    REF_DATA=ref_data, REF_NAME=ref_name, $
    SEED=seed, $
    WEIGHTS=weights

;***********************************************************************
; Constants and options

; Confirm consistent inputs
n_data = n_elements( sample_data )
if n_data le 1 then stop
if min( finite( sample_data ) ) eq 0 then stop
if n_elements( ref_data ) ne n_data then begin
  if not( keyword_set( ref_name ) ) then stop
endif else begin
  if min( finite( ref_data ) ) eq 0 then stop
endelse

; Ensure a supported method is specified
if n_elements( method ) ne 1 then stop
if max( method eq [ 'monte carlo', 'standard' ] ) ne 1 then stop

; Settings for the Monte Carlo method
if method eq 'monte carlo' then begin
  ; This requires a functionalised distribution generator
  if not( keyword_set( ref_name ) ) then stop
  ; The Monte Carlo sample size
  n_mc = 10000
endif

; Settings for the standard method
if method eq 'standard' then begin
  ; The truncation on the summation series
  n_trunc = 100
  ; Weighting is not supported for this method
  if keyword_set( weights ) then stop
endif

; Normalise weights
if keyword_set( weights ) then weights = weights / total( weights )

;***********************************************************************
; Define cumulative distribution functions

; Calculate the empirical cumulative distribution function for the sampled
; data
id_sort = sort( sample_data )
cdf_x = sample_data[id_sort]
if keyword_set( weights ) then begin
  cdf_y_sample = ( total( weights[id_sort], cumulative=1 ) $
      + total( [ 0., weights[id_sort[0:n_data-2]] ], cumulative=1 ) ) / 2.
endif else begin
  cdf_y_sample = ( findgen( n_data ) + 0.5 ) / n_data
endelse

; Calculate the reference distribution
if keyword_set( ref_name ) then begin
  ; The uniform distribution
  if ref_name eq 'uniform' then begin
    ; The sample data must be bounded
    if ( min( sample_data ) lt 0 ) or ( max( sample_data ) gt 1 ) then stop
    ; Define the reference CDF at the sampled locations
    cdf_y_ref = cdf_x
  ; The Gaussian distribution
  endif else if ref_name eq 'gaussian' then begin
    ; Define the reference CDF at the sampled locations
    ; (GAUSS_PDF returns the cumulative distribution function of the unit
    ; Gaussian, despite its name)
    cdf_y_ref = gauss_pdf( cdf_x )
  ; Otherwise not supported
  endif else begin
    stop
  endelse
; Otherwise adopt the input distribution, sorted to match the sample
endif else begin
  cdf_y_ref = ref_data[id_sort]
endelse

;***********************************************************************
; Perform the K-S test

; Calculate the K-S statistic
ks_stat = max( abs( cdf_y_sample - cdf_y_ref ), id )
;ks_stat_x = cdf_x[id]

; If we are using the Monte Carlo method
if method eq 'monte carlo' then begin

  ; Initialise vector of Monte Carlo K-S statistics
  mc_ks_stat = fltarr( n_mc )
  ;mc_ks_stat_x = fltarr( n_mc )

  ; Iterate through Monte Carlo samples
  for i_mc = 0, n_mc - 1 do begin
    ; Generate a random sample from the uniform distribution
    if ref_name eq 'uniform' then begin
      sample_data_mc = randomu( seed, n_data )
    ; Generate a random sample from the Gaussian distribution
    endif else if ref_name eq 'gaussian' then begin
      sample_data_mc = randomn( seed, n_data )
    ; Otherwise this distribution is not supported
    endif else begin
      stop
    endelse
    ; Sort the Monte Carlo sample
    id = sort( sample_data_mc )
    mc_cdf_x = sample_data_mc[id]
    if keyword_set( weights ) then begin
      mc_cdf_y_sample = ( total( weights[id], cumulative=1 ) $
          + total( [ 0., weights[id[0:n_data-2]] ], cumulative=1 ) ) / 2.
    endif else begin
      ; The unweighted midpoint ECDF is identical for every sample
      mc_cdf_y_sample = cdf_y_sample
    endelse
    ; The uniform distribution
    if ref_name eq 'uniform' then begin
      ; Define the reference CDF at the sampled locations
      mc_cdf_y_ref = mc_cdf_x
    ; The Gaussian distribution
    endif else if ref_name eq 'gaussian' then begin
      ; Define the reference CDF at the sampled locations
      mc_cdf_y_ref = gauss_pdf( mc_cdf_x )
    ; Otherwise not supported
    endif else begin
      stop
    endelse
    ; Calculate the K-S statistic
    mc_ks_stat[i_mc] = max( abs( mc_cdf_y_sample - mc_cdf_y_ref ), id )
    ;mc_ks_stat_x[i_mc] = mc_cdf_x[id]
  endfor

  ; Determine the location of the input data within the Monte Carlo samples
  id = sort( mc_ks_stat )
  mc_ks_stat = mc_ks_stat[id]
  id = max( where( mc_ks_stat le ks_stat, n_id ) )
  if n_id eq 0 then begin
    ; The observed statistic lies below all Monte Carlo statistics
    p_value = 1. / ( n_mc + 1. ) / 2.
  endif else if id eq n_mc - 1 then begin
    ; The observed statistic lies above all Monte Carlo statistics
    ; (float arithmetic avoids integer division)
    p_value = ( n_mc + 0.5 ) / ( n_mc + 1. )
  endif else begin
    ; Interpolate the rank of the observed statistic
    p_value = ( ks_stat - mc_ks_stat[id] ) $
        / ( mc_ks_stat[id+1] - mc_ks_stat[id] ) * ( id + 1. ) $
        + ( mc_ks_stat[id+1] - ks_stat ) $
        / ( mc_ks_stat[id+1] - mc_ks_stat[id] ) * id
    p_value = ( p_value + 1. ) / ( n_mc + 1. )
  endelse
  ; Convert the position within the Monte Carlo distribution of the
  ; statistic to a significance level, consistent with the standard method
  p_value = 1. - p_value

; For the standard solution
endif else if method eq 'standard' then begin

  ; Perform the calculation in double precision.

  ; Define the summation index
  index_sum = 1 + dindgen( n_trunc )

  ; Set the location in the distribution, using an adjustment for improved
  ; accuracy (see https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test)
  temp = sqrt( n_data ) * ks_stat
  temp = temp + 1.d / ( 6.d * sqrt( n_data ) ) $
      + ( temp - 1.d ) / ( 4.d * n_data )

  ; Calculate the p-value
  p_value = sqrt( 2.d * !dpi ) / temp $
      * total( exp( -( ( 2.d * index_sum - 1.d ) ^ 2.d ) * ( !dpi ^ 2.d ) $
      / ( 8.d * ( temp ^ 2.d ) ) ) )
  p_value = 1. - p_value

; Otherwise the requested method is not yet supported
endif else begin
  stop
endelse

;***********************************************************************
; The end

return, p_value
END
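; The 'standard' branch above evaluates the truncated Kolmogorov series
; with a small-sample adjustment to sqrt(N)*D.  A minimal Python sketch of
; the same calculation is given below as an independent cross-check; the
; function names kolmogorov_cdf and ks_p_value are mine, not part of the
; IDL code, and the truncation default mirrors n_trunc = 100 above.

```python
import math

def kolmogorov_cdf(t, n_trunc=100):
    """Truncated series for the Kolmogorov distribution CDF:
    P(K <= t) = sqrt(2*pi)/t * sum_{i>=1} exp(-(2i-1)^2 pi^2 / (8 t^2))."""
    if t <= 0.0:
        return 0.0
    s = sum(math.exp(-((2 * i - 1) ** 2) * math.pi ** 2 / (8.0 * t ** 2))
            for i in range(1, n_trunc + 1))
    return math.sqrt(2.0 * math.pi) / t * s

def ks_p_value(ks_stat, n_data, n_trunc=100):
    """One-sample KS p-value, mirroring the 'standard' branch: apply the
    small-sample adjustment to sqrt(N)*D, then return 1 - CDF."""
    t = math.sqrt(n_data) * ks_stat
    t_adj = t + 1.0 / (6.0 * math.sqrt(n_data)) + (t - 1.0) / (4.0 * n_data)
    return 1.0 - kolmogorov_cdf(t_adj, n_trunc)
```

; As a sanity check, the series reproduces the familiar critical points of
; the Kolmogorov distribution (CDF ~ 0.95 at t = 1.358, ~ 0.99 at
; t = 1.628), and the p-value decreases as the K-S statistic grows.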