001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.statistics.inference; 019 020import java.util.Arrays; 021import java.util.Objects; 022import java.util.function.DoubleSupplier; 023import java.util.function.DoubleUnaryOperator; 024import java.util.function.IntToDoubleFunction; 025import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble; 026import org.apache.commons.numbers.combinatorics.Factorial; 027import org.apache.commons.numbers.core.ArithmeticUtils; 028import org.apache.commons.numbers.core.Sum; 029import org.apache.commons.rng.UniformRandomProvider; 030 031/** 032 * Implements the Kolmogorov-Smirnov (K-S) test for equality of continuous distributions. 033 * 034 * <p>The one-sample test uses a D statistic based on the maximum deviation of the empirical 035 * distribution of sample data points from the distribution expected under the null hypothesis. 036 * 037 * <p>The two-sample test uses a D statistic based on the maximum deviation of the two empirical 038 * distributions of sample data points. 
 * The two-sample tests evaluate the null hypothesis that
 * the two samples {@code x} and {@code y} come from the same underlying distribution.
 *
 * <p>References:
 * <ol>
 * <li>
 * Marsaglia, G., Tsang, W. W., & Wang, J. (2003).
 * <a href="https://doi.org/10.18637/jss.v008.i18">Evaluating Kolmogorov's Distribution.</a>
 * Journal of Statistical Software, 8(18), 1–4.</li>
 * <li>Simard, R., & L’Ecuyer, P. (2011).
 * <a href="https://doi.org/10.18637/jss.v039.i11">Computing the Two-Sided Kolmogorov-Smirnov Distribution.</a>
 * Journal of Statistical Software, 39(11), 1–18.</li>
 * <li>Sekhon, J. S. (2011).
 * <a href="https://doi.org/10.18637/jss.v042.i07">
 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
 * The Matching package for R.</a>
 * Journal of Statistical Software, 42(7), 1–52.</li>
 * <li>Viehmann, T (2021).
 * <a href="https://doi.org/10.48550/arXiv.2102.08037">
 * Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.</a>
 * arXiv:2102.08037</li>
 * <li>Hodges, J. L. (1958).
 * <a href="https://doi.org/10.1007/BF02589501">
 * The significance probability of the smirnov two-sample test.</a>
 * Arkiv for Matematik, 3(5), 469-486.</li>
 * </ol>
 *
 * <p>Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
 * Kolmogorov-Smirnov (K-S) test (Wikipedia)</a>
 * @since 1.1
 */
public final class KolmogorovSmirnovTest {
    /** Name for sample 1 (used in exception messages from the sort/validation helpers). */
    private static final String SAMPLE_1_NAME = "Sample 1";
    /** Name for sample 2 (used in exception messages from the sort/validation helpers). */
    private static final String SAMPLE_2_NAME = "Sample 2";
    /** When the largest sample size exceeds this value, 2-sample test AUTO p-value
     * uses an asymptotic distribution to compute the p-value. */
    private static final int LARGE_SAMPLE = 10000;
    /** Maximum finite factorial. */
    private static final int MAX_FACTORIAL = 170;
    /** Maximum length of an array. This is used to determine if two arrays can be concatenated
     * to create a sampler from the joint distribution. The limit is copied from the limit
     * of java.util.ArrayList. */
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
    /** The maximum least common multiple (lcm) to attempt the exact p-value computation.
     * The integral d value is in [0, n*m] in steps of the greatest common denominator (gcd),
     * thus lcm = n*m/gcd is the number of possible different p-values.
     * Some methods have a lower limit due to computation limits. This should be larger
     * than LARGE_SAMPLE^2 so all AUTO p-values attempt an exact computation, i.e.
     * at least 10000^2 ~ 2^26.56. */
    private static final long MAX_LCM_TWO_SAMPLE_EXACT_P = 1L << 31;
    /** Placeholder to use for the two-sample sign array when the value can be ignored.
     * NOTE(review): this static scratch array is written by the statistic computation when
     * callers use {@code statistic(...)}; the value is discarded so concurrent writes appear
     * benign — confirm this shared-mutable placeholder is an accepted idiom here. */
    private static final int[] IGNORED_SIGN = new int[1];
    /** Placeholder to use for the two-sample ties D array when the value can be ignored. */
    private static final long[] IGNORED_D = new long[2];
    /** Default instance. */
    private static final KolmogorovSmirnovTest DEFAULT = new KolmogorovSmirnovTest(
        AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, false, null, 1000);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Method to compute the p-value. */
    private final PValueMethod pValueMethod;
    /** Use a strict inequality for the two-sample exact p-value. */
    private final boolean strictInequality;
    /** Source of randomness (may be null; required only for p-value estimation). */
    private final UniformRandomProvider rng;
    /** Number of iterations used for p-value estimation. */
    private final int iterations;

    /**
     * Result for the one-sample Kolmogorov-Smirnov test.
     *
     * <p>This class is immutable.
     *
     * @since 1.1
     */
    public static class OneResult extends BaseSignificanceResult {
        /** Sign of the statistic. */
        private final int sign;

        /**
         * Create an instance.
         *
         * @param statistic Test statistic.
         * @param sign Sign of the statistic.
         * @param p Result p-value.
         */
        OneResult(double statistic, int sign, double p) {
            super(statistic, p);
            this.sign = sign;
        }

        /**
         * Gets the sign of the statistic. This is 1 for \(D^+\) and -1 for \(D^-\).
         * For a two-sided test this is zero if the magnitude of \(D^+\) and \(D^-\) was equal;
         * otherwise the sign indicates the larger of \(D^+\) or \(D^-\).
         *
         * @return the sign
         */
        public int getSign() {
            return sign;
        }
    }

    /**
     * Result for the two-sample Kolmogorov-Smirnov test.
     *
     * <p>This class is immutable.
     *
     * @since 1.1
     */
    public static final class TwoResult extends OneResult {
        /** Flag to indicate there were significant ties.
         * Note that in extreme cases there may be significant ties despite {@code upperD == D}
         * due to rounding when converting the integral statistic to a double. For this
         * reason the presence of ties is stored as a flag. */
        private final boolean significantTies;
        /** Upper bound of the D statistic from all possible paths through regions with ties. */
        private final double upperD;
        /** The p-value of the upper D value. */
        private final double upperP;

        /**
         * Create an instance.
         *
         * @param statistic Test statistic.
         * @param sign Sign of the statistic.
         * @param p Result p-value.
         * @param significantTies Flag to indicate there were significant ties.
         * @param upperD Upper bound of the D statistic from all possible paths through
         * regions with ties.
         * @param upperP The p-value of the upper D value.
         */
        TwoResult(double statistic, int sign, double p, boolean significantTies, double upperD, double upperP) {
            super(statistic, sign, p);
            this.significantTies = significantTies;
            this.upperD = upperD;
            this.upperP = upperP;
        }

        /**
         * {@inheritDoc}
         *
         * <p><strong>Ties</strong>
         *
         * <p>The presence of ties in the data creates a distribution for the D values generated
         * by all possible orderings of the tied regions. This statistic is computed using the
         * path with the <em>minimum</em> effect on the D statistic.
         *
         * <p>For a one-sided statistic \(D^+\) or \(D^-\), this is the lower bound of the D statistic.
         *
         * <p>For a two-sided statistic D, this may be <em>below</em> the lower bound of the
         * distribution of all possible D values. This case occurs when the number of ties
         * is very high and is identified by a {@link #getPValue() p-value} of 1.
         *
         * <p>If the two-sided statistic is zero this only occurs in the presence of ties:
         * either the two arrays are identical, are 'identical' data of a single value
         * (sample sizes may be different), or have a sequence of ties of 'identical' data
         * with a net zero effect on the D statistic, e.g.
         * <pre>
         * [1,2,3] vs [1,2,3]
         * [0,0,0,0] vs [0,0,0]
         * [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
         * </pre>
         */
        @Override
        public double getStatistic() {
            // Note: This method is here for documentation
            return super.getStatistic();
        }

        /**
         * Returns {@code true} if there were ties between samples that occurred
         * in a region which could change the D statistic if the ties were resolved to
         * a defined order.
         *
         * <p>Ties between the data can be interpreted as if the values were different
         * but within machine epsilon. In this case the order within the tie region is not known.
         * If the most extreme ordering of any tied regions (e.g. all tied values of {@code x}
         * before all tied values of {@code y}) could create a larger D statistic this
         * method will return {@code true}.
         *
         * <p>If there were no ties, or all possible orderings of tied regions create the same
         * D statistic, this method returns {@code false}.
         *
         * <p>Note it is possible that this method returns {@code true} when {@code D == upperD}
         * due to rounding when converting the computed D statistic to a double. This will
         * only occur for large sample sizes {@code n} and {@code m} where the product
         * {@code n*m >= 2^53}.
         *
         * @return true if the D statistic could be changed by resolution of ties
         * @see #getUpperD()
         */
        public boolean hasSignificantTies() {
            return significantTies;
        }

        /**
         * Return the upper bound of the D statistic from all possible paths through regions with ties.
         *
         * <p>This will return a value equal to or greater than {@link #getStatistic()}.
         *
         * @return the upper bound of D
         * @see #hasSignificantTies()
         */
        public double getUpperD() {
            return upperD;
        }

        /**
         * Return the p-value of the upper bound of the D statistic.
         *
         * <p>If computed, this will return a value equal to or less than
         * {@link #getPValue() getPValue}. It may be orders of magnitude smaller.
         *
         * <p>Note: This p-value corresponds to the most extreme p-value from all possible
         * orderings of tied regions. It is <strong>not</strong> recommended to use this to
         * reject the null hypothesis. The upper bound of D and the corresponding p-value
         * provide information that must be interpreted in the context of the sample data and
         * used to inform a decision on the suitability of the test to the data.
         *
         * <p>This value is set to {@link Double#NaN NaN} if the {@link #getPValue() p-value} was
         * {@linkplain PValueMethod#ESTIMATE estimated}. The estimated p-value will have been created
         * using a distribution of possible D values given the underlying joint distribution of
         * the sample data. Comparison of the p-value to the upper p-value is not applicable.
         *
         * @return the p-value of the upper bound of D (or NaN)
         * @see #getUpperD()
         */
        public double getUpperPValue() {
            return upperP;
        }
    }

    /**
     * Create an instance with the given configuration. All combining of options is
     * performed by the {@code with...} methods which delegate here.
     *
     * @param alternative Alternative hypothesis.
     * @param method P-value method.
     * @param strict true to use a strict inequality.
     * @param rng Source of randomness.
     * @param iterations Number of iterations.
     */
    private KolmogorovSmirnovTest(AlternativeHypothesis alternative, PValueMethod method, boolean strict,
            UniformRandomProvider rng, int iterations) {
        this.alternative = alternative;
        this.pValueMethod = method;
        this.strictInequality = strict;
        this.rng = rng;
        this.iterations = iterations;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@link AlternativeHypothesis#TWO_SIDED}</li>
     * <li>{@link PValueMethod#AUTO}</li>
     * <li>{@link Inequality#NON_STRICT}</li>
     * <li>{@linkplain #with(UniformRandomProvider) RNG = none}</li>
     * <li>{@linkplain #withIterations(int) Iterations = 1000}</li>
     * </ul>
     *
     * @return default instance
     */
    public static KolmogorovSmirnovTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured alternative hypothesis.
     *
     * @param v Value.
     * @return an instance
     */
    public KolmogorovSmirnovTest with(AlternativeHypothesis v) {
        return new KolmogorovSmirnovTest(Objects.requireNonNull(v), pValueMethod, strictInequality, rng, iterations);
    }

    /**
     * Return an instance with the configured p-value method.
     *
     * <p>For the one-sample two-sided test Kolmogorov's asymptotic approximation can be used;
     * otherwise the p-value uses the distribution of the D statistic.
     *
     * <p>For the two-sample test the exact p-value can be computed for small sample sizes;
     * otherwise the p-value resorts to the asymptotic approximation. Alternatively a p-value
     * can be estimated from the combined distribution of the samples. This requires a source
     * of randomness.
     *
     * @param v Value.
     * @return an instance
     * @see #with(UniformRandomProvider)
     */
    public KolmogorovSmirnovTest with(PValueMethod v) {
        return new KolmogorovSmirnovTest(alternative, Objects.requireNonNull(v), strictInequality, rng, iterations);
    }

    /**
     * Return an instance with the configured inequality.
     *
     * <p>Computes the p-value for the two-sample test as \(P(D_{n,m} > d)\) if strict;
     * otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample
     * Kolmogorov-Smirnov statistic, either the two-sided \(D_{n,m}\) or one-sided
     * \(D_{n,m}^+\) or \(D_{n,m}^-\).
     *
     * @param v Value.
     * @return an instance
     */
    public KolmogorovSmirnovTest with(Inequality v) {
        return new KolmogorovSmirnovTest(alternative, pValueMethod,
            Objects.requireNonNull(v) == Inequality.STRICT, rng, iterations);
    }

    /**
     * Return an instance with the configured source of randomness.
     *
     * <p>Applies to the two-sample test when the p-value method is set to
     * {@link PValueMethod#ESTIMATE ESTIMATE}. The randomness
     * is used for sampling of the combined distribution.
     *
     * <p>There is no default source of randomness. If the randomness is not
     * set then estimation is not possible and an {@link IllegalStateException} will be
     * raised in the two-sample test.
     *
     * @param v Value.
     * @return an instance
     * @see #with(PValueMethod)
     */
    public KolmogorovSmirnovTest with(UniformRandomProvider v) {
        return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality,
            Objects.requireNonNull(v), iterations);
    }

    /**
     * Return an instance with the configured number of iterations.
     *
     * <p>Applies to the two-sample test when the p-value method is set to
     * {@link PValueMethod#ESTIMATE ESTIMATE}. This is the number of sampling iterations
     * used to estimate the p-value. The p-value is a fraction using the {@code iterations}
     * as the denominator. The number of significant digits in the p-value is
     * upper bounded by log<sub>10</sub>(iterations); small p-values have fewer significant
     * digits. A large number of iterations is recommended when using a small critical
     * value to reject the null hypothesis.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the number of iterations is not strictly positive
     */
    public KolmogorovSmirnovTest withIterations(int v) {
        return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality, rng,
            Arguments.checkStrictlyPositive(v));
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic.
     *
     * <ul>
     * <li>two-sided: \(D_n=\sup_x |F_n(x)-F(x)|\)</li>
     * <li>greater: \(D_n^+=\sup_x (F_n(x)-F(x))\)</li>
     * <li>less: \(D_n^-=\sup_x (F(x)-F_n(x))\)</li>
     * </ul>
     *
     * <p>where \(F\) is the distribution cumulative density function ({@code cdf}),
     * \(n\) is the length of {@code x} and \(F_n\) is the empirical distribution that puts
     * mass \(1/n\) at each of the values in {@code x}.
     *
     * <p>The cumulative distribution function should map a real value {@code x} to a probability
     * in [0, 1]. To use a reference distribution the CDF can be passed using a method reference:
     * <pre>
     * UniformContinuousDistribution dist = UniformContinuousDistribution.of(0, 1);
     * UniformRandomProvider rng = RandomSource.KISS.create(123);
     * double[] x = dist.sampler(rng).samples(100);
     * double d = KolmogorovSmirnovTest.withDefaults().statistic(x, dist::cumulativeProbability);
     * </pre>
     *
     * @param cdf Reference cumulative distribution function.
     * @param x Sample being evaluated.
     * @return Kolmogorov-Smirnov statistic
     * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
     * @see #test(double[], DoubleUnaryOperator)
     */
    public double statistic(double[] x, DoubleUnaryOperator cdf) {
        // The sign of the statistic is not required here; pass the shared placeholder.
        return computeStatistic(x, cdf, IGNORED_SIGN);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic.
     *
     * <ul>
     * <li>two-sided: \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)</li>
     * <li>greater: \(D_{n,m}^+=\sup_x (F_n(x)-F_m(x))\)</li>
     * <li>less: \(D_{n,m}^-=\sup_x (F_m(x)-F_n(x))\)</li>
     * </ul>
     *
     * <p>where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution that puts mass \(1/m\) at each of the values in {@code y}.
     *
     * @param x First sample.
     * @param y Second sample.
     * @return Kolmogorov-Smirnov statistic
     * @throws IllegalArgumentException if either {@code x} or {@code y} does not
     * have length at least 2; or contain NaN values.
     * @see #test(double[], double[])
     */
    public double statistic(double[] x, double[] y) {
        final int n = checkArrayLength(x);
        final int m = checkArrayLength(y);
        // Clone to avoid destructive modification of input
        final long dnm = computeIntegralKolmogorovSmirnovStatistic(x.clone(), y.clone(),
            IGNORED_SIGN, IGNORED_D);
        // Re-use the method to compute D in [0, 1] for consistency
        return computeD(dnm, n, m, ArithmeticUtils.gcd(n, m));
    }

    /**
     * Performs a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis
     * that {@code x} conforms to the distribution cumulative density function ({@code cdf}).
     *
     * <p>The test is defined by the {@link AlternativeHypothesis}:
     * <ul>
     * <li>Two-sided evaluates the null hypothesis that the two distributions are
     * identical, \(F_n(i) = F(i)\) for all \( i \); the alternative is that they are not
     * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.</li>
     * <li>Greater evaluates the null hypothesis that the \(F_n(i) <= F(i)\) for all \( i \);
     * the alternative is \(F_n(i) > F(i)\) for at least one \( i \). The statistic is \( D_n^+ \).</li>
     * <li>Less evaluates the null hypothesis that the \(F_n(i) >= F(i)\) for all \( i \);
     * the alternative is \(F_n(i) < F(i)\) for at least one \( i \). The statistic is \( D_n^- \).</li>
     * </ul>
     *
     * <p>The p-value method defaults to exact. The one-sided p-value uses Smirnov's stable formula:
     *
     * <p>\[ P(D_n^+ \ge x) = x \sum_{j=0}^{\lfloor n(1-x) \rfloor} \binom{n}{j} \left(\frac{j}{n} + x\right)^{j-1} \left(1-x-\frac{j}{n} \right)^{n-j} \]
     *
     * <p>The two-sided p-value is computed using methods described in
     * Simard & L’Ecuyer (2011).
The two-sided test supports an asymptotic p-value 478 * using Kolmogorov's formula: 479 * 480 * <p>\[ \lim_{n\to\infty} P(\sqrt{n}D_n > z) = 2 \sum_{i=1}^\infty (-1)^{i-1} e^{-2 i^2 z^2} \] 481 * 482 * @param x Sample being being evaluated. 483 * @param cdf Reference cumulative distribution function. 484 * @return test result 485 * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values. 486 * @see #statistic(double[], DoubleUnaryOperator) 487 */ 488 public OneResult test(double[] x, DoubleUnaryOperator cdf) { 489 final int[] sign = {0}; 490 final double d = computeStatistic(x, cdf, sign); 491 final double p; 492 if (alternative == AlternativeHypothesis.TWO_SIDED) { 493 PValueMethod method = pValueMethod; 494 if (method == PValueMethod.AUTO) { 495 // No switch to the asymptotic for large n 496 method = PValueMethod.EXACT; 497 } 498 if (method == PValueMethod.ASYMPTOTIC) { 499 // Kolmogorov's asymptotic formula using z = sqrt(n) * d 500 p = KolmogorovSmirnovDistribution.ksSum(Math.sqrt(x.length) * d); 501 } else { 502 // exact 503 p = KolmogorovSmirnovDistribution.Two.sf(d, x.length); 504 } 505 } else { 506 // one-sided: always use exact 507 p = KolmogorovSmirnovDistribution.One.sf(d, x.length); 508 } 509 return new OneResult(d, sign[0], p); 510 } 511 512 /** 513 * Performs a two-sample Kolmogorov-Smirnov test on samples {@code x} and {@code y}. 514 * Test the empirical distributions \(F_n\) and \(F_m\) where \(n\) is the length 515 * of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the empirical distribution 516 * that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical 517 * distribution that puts mass \(1/m\) of the {@code y} values. 
     *
     * <p>The test is defined by the {@link AlternativeHypothesis}:
     * <ul>
     * <li>Two-sided evaluates the null hypothesis that the two distributions are
     * identical, \(F_n(i) = F_m(i)\) for all \( i \); the alternative is that they are not
     * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.</li>
     * <li>Greater evaluates the null hypothesis that the \(F_n(i) <= F_m(i)\) for all \( i \);
     * the alternative is \(F_n(i) > F_m(i)\) for at least one \( i \). The statistic is \( D_n^+ \).</li>
     * <li>Less evaluates the null hypothesis that the \(F_n(i) >= F_m(i)\) for all \( i \);
     * the alternative is \(F_n(i) < F_m(i)\) for at least one \( i \). The statistic is \( D_n^- \).</li>
     * </ul>
     *
     * <p>If the {@linkplain PValueMethod p-value method} is auto, then an exact p computation
     * is attempted if both sample sizes are less than 10000 using the methods presented in
     * Viehmann (2021) and Hodges (1958); otherwise an asymptotic p-value is returned.
     * The two-sided p-value is \(\overline{F}(d, \sqrt{mn / (m + n)})\) where \(\overline{F}\)
     * is the complementary cumulative density function of the two-sided one-sample
     * Kolmogorov-Smirnov distribution. The one-sided p-value uses an approximation from
     * Hodges (1958) Eq 5.3.
     *
     * <p>\(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the
     * null hypothesis \(H_0 : D_{n,m} \gt d \) differ from \(H_0 : D_{n,m} \ge d \)
     * by the mass of the observed value \(d\). These can be distinguished using an
     * {@link Inequality} parameter. This is ignored for large samples.
     *
     * <p>If the data contains ties there is no defined ordering in the tied region to use
     * for the difference between the two empirical distributions. Each ordering of the
     * tied region <em>may</em> create a different D statistic. All possible orderings
     * generate a distribution for the D value. In this case the tied region is traversed
     * entirely and the effect on the D value evaluated at the end of the tied region.
     * This is the path with the least change on the D statistic. The path with the
     * greatest change on the D statistic is also computed as the upper bound on D.
     * If these two values are different then the tied region is known to generate a
     * distribution for the D statistic and the p-value is an overestimate for the cases
     * with a larger D statistic. The presence of any significant tied regions is returned
     * in the result.
     *
     * <p>If the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE} then the p-value is
     * estimated by repeated sampling of the joint distribution of {@code x} and {@code y}.
     * The p-value is the frequency that a sample creates a D statistic
     * greater than or equal to (or greater than for strict inequality) the observed value.
     * In this case a source of randomness must be configured or an {@link IllegalStateException}
     * will be raised. The p-value for the upper bound on D will not be estimated and is set to
     * {@link Double#NaN NaN}. This estimation procedure is not affected by ties in the data
     * and is increasingly robust for larger datasets. The method is modeled after
     * <a href="https://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
     * in the R {@code Matching} package (Sekhon (2011)).
     *
     * @param x First sample.
     * @param y Second sample.
     * @return test result
     * @throws IllegalArgumentException if either {@code x} or {@code y} does not
     * have length at least 2; or contain NaN values.
     * @throws IllegalStateException if the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE}
     * and there is no source of randomness.
     * @see #statistic(double[], double[])
     */
    public TwoResult test(double[] x, double[] y) {
        final int n = checkArrayLength(x);
        final int m = checkArrayLength(y);
        PValueMethod method = pValueMethod;
        final int[] sign = {0};
        final long[] tiesD = {0, 0};

        // Copies are sorted in-place by the statistic computation; they are
        // reused below by estimateP which also destructively modifies them.
        final double[] sx = x.clone();
        final double[] sy = y.clone();
        final long dnm = computeIntegralKolmogorovSmirnovStatistic(sx, sy, sign, tiesD);

        // Compute p-value. Note that the p-value is not invalidated by ties; it is the
        // D statistic that could be invalidated by resolution of the ties. So compute
        // the exact p even if ties are present.
        if (method == PValueMethod.AUTO) {
            // Use exact for small samples
            method = Math.max(n, m) < LARGE_SAMPLE ?
                PValueMethod.EXACT :
                PValueMethod.ASYMPTOTIC;
        }
        final int gcd = ArithmeticUtils.gcd(n, m);
        final double d = computeD(dnm, n, m, gcd);
        // tiesD[1] is the integral D of the most extreme path through tied regions;
        // only a value exceeding the observed dnm can change the conclusion.
        final boolean significantTies = tiesD[1] > dnm;
        final double d2 = significantTies ? computeD(tiesD[1], n, m, gcd) : d;

        final double p;
        final double p2;

        // Allow bootstrap estimation of the p-value
        if (method == PValueMethod.ESTIMATE) {
            p = estimateP(sx, sy, dnm);
            p2 = Double.NaN;
        } else {
            final boolean exact = method == PValueMethod.EXACT;
            p = twoSampleP(dnm, n, m, gcd, d, exact);
            if (significantTies) {
                // Compute the upper bound on D.
                // The p-value is also computed. The alternative is to save the options
                // in the result with (upper dnm, n, m) and compute it on-demand.
                // Note detection of whether the exact P computation is possible is based on
                // n and m, thus this will use the same computation.
                p2 = twoSampleP(tiesD[1], n, m, gcd, d2, exact);
            } else {
                p2 = p;
            }
        }
        return new TwoResult(d, sign[0], p, significantTies, d2, p2);
    }

    /**
     * Estimates the <i>p-value</i> of a two-sample Kolmogorov-Smirnov test evaluating the
     * null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution.
     *
     * <p>This method will destructively modify the input arrays (via a sort).
     *
     * <p>This method estimates the p-value by repeatedly sampling sets of size
     * {@code x.length} and {@code y.length} from the empirical distribution
     * of the combined sample. The memory requirement is an array copy of each of
     * the input arguments.
     *
     * <p>When using strict inequality, this is equivalent to the algorithm implemented in
     * the R function {@code ks.boot} as described in Sekhon (2011) Journal of Statistical
     * Software, 42(7), 1–52 [3].
     *
     * @param x First sample.
     * @param y Second sample.
     * @param dnm Integral D statistic.
     * @return p-value
     * @throws IllegalStateException if the source of randomness is null.
     */
    private double estimateP(double[] x, double[] y, long dnm) {
        if (rng == null) {
            throw new IllegalStateException("No source of randomness");
        }

        // Test if the random statistic is greater (strict), or greater or equal to d
        final long d = strictInequality ? dnm : dnm - 1;

        // Inclusive bounds [minus, plus]: a random statistic outside these bounds
        // counts towards the p-value. Unused bounds are set to the extreme long values.
        final long plus;
        final long minus;
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            plus = d;
            minus = Long.MIN_VALUE;
        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
            plus = Long.MAX_VALUE;
            minus = -d;
        } else {
            // two-sided
            plus = d;
            minus = -d;
        }

        // Test dnm=0. This occurs for example when x == y.
        if (0 < minus || 0 > plus) {
            // Edge case where all possible d will be outside the inclusive bounds
            return 1;
        }

        // Sample randomly with replacement from the combined distribution.
        final DoubleSupplier gen = createSampler(x, y, rng);
        int count = 0;
        for (int i = iterations; i > 0; i--) {
            // Regenerate both samples in-place from the joint empirical distribution.
            for (int j = 0; j < x.length; j++) {
                x[j] = gen.getAsDouble();
            }
            for (int j = 0; j < y.length; j++) {
                y[j] = gen.getAsDouble();
            }
            if (testIntegralKolmogorovSmirnovStatistic(x, y, plus, minus)) {
                count++;
            }
        }
        return count / (double) iterations;
    }

    /**
     * Computes the magnitude of the one-sample Kolmogorov-Smirnov test statistic.
     * The sign of the statistic is optionally returned in {@code sign}. For the two-sided case
     * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
     * had the larger magnitude.
     *
     * @param x Sample being evaluated.
     * @param cdf Reference cumulative distribution function.
     * @param sign Sign of the statistic (non-zero length).
     * @return Kolmogorov-Smirnov statistic
     * @throws IllegalArgumentException if {@code data} does not have length at least 2;
     * or contains NaN values.
     */
    private double computeStatistic(double[] x, DoubleUnaryOperator cdf, int[] sign) {
        final int n = checkArrayLength(x);
        final double nd = n;
        final double[] sx = sort(x.clone(), "Sample");
        // Note: ties in the data do not matter as we compare the empirical CDF
        // immediately before the value (i/n) and at the value (i+1)/n. For ties
        // of length m this would be (i-m+1)/n and (i+1)/n and the result is the same.
        double d = 0;
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            // Compute D+
            for (int i = 0; i < n; i++) {
                final double yi = cdf.applyAsDouble(sx[i]);
                final double dp = (i + 1) / nd - yi;
                d = dp > d ? dp : d;
            }
            sign[0] = 1;
        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
            // Compute D-
            for (int i = 0; i < n; i++) {
                final double yi = cdf.applyAsDouble(sx[i]);
                final double dn = yi - i / nd;
                d = dn > d ? dn : d;
            }
            sign[0] = -1;
        } else {
            // Two sided.
            // Compute both (as unsigned) and return the sign indicating the largest result.
            double plus = 0;
            double minus = 0;
            for (int i = 0; i < n; i++) {
                final double yi = cdf.applyAsDouble(sx[i]);
                final double dn = yi - i / nd;
                final double dp = (i + 1) / nd - yi;
                minus = dn > minus ? dn : minus;
                plus = dp > plus ? dp : plus;
            }
            sign[0] = Double.compare(plus, minus);
            d = Math.max(plus, minus);
        }
        return d;
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic. The statistic is integral
     * and has a value in {@code [0, n*m]}. Not all values are possible; the smallest
     * increment is the greatest common divisor of {@code n} and {@code m}.
     *
     * <p>This method will destructively modify the input arrays (via a sort).
     *
     * <p>The sign of the statistic is returned in {@code sign}. For the two-sided case
     * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
     * had the larger magnitude. If the two-sided statistic is zero the two arrays are
     * identical, or are 'identical' data of a single value (sample sizes may be different),
     * or have a sequence of ties of 'identical' data with a net zero effect on the D statistic
     * e.g.
     * <pre>
     * [1,2,3] vs [1,2,3]
     * [0,0,0,0] vs [0,0,0]
     * [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
     * </pre>
     *
     * <p>This method detects ties in the input data. Not all ties will invalidate the returned
     * statistic. Ties between the data can be interpreted as if the values were different
     * but within machine epsilon. In this case the path through the tie region is not known.
768 * All paths through the tie region end at the same point. This method will compute the 769 * most extreme path through each tie region and the D statistic for these paths. If the 770 * ties D statistic is a larger magnitude than the returned D statistic then at least 771 * one tie region lies at a point on the full path that could result in a different 772 * statistic in the absence of ties. This signals the P-value computed using the returned 773 * D statistic is one of many possible p-values given the data and how ties are resolved. 774 * Note: The tiesD value may be smaller than the returned D statistic as it is not set 775 * to the maximum of D or tiesD. The only result of interest is when {@code tiesD > D} 776 * due to a tie region that can change the output D. On output {@code tiesD[0] != 0} 777 * if there were ties between samples and {@code tiesD[1] = D} of the most extreme path 778 * through the ties. 779 * 780 * @param x First sample (destructively modified by sort). 781 * @param y Second sample (destructively modified by sort). 782 * @param sign Sign of the statistic (non-zero length). 783 * @param tiesD Integral statistic for the most extreme path through any ties (length at least 2). 784 * @return integral Kolmogorov-Smirnov statistic 785 * @throws IllegalArgumentException if either {@code x} or {@code y} contain NaN values. 786 */ 787 private long computeIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, int[] sign, long[] tiesD) { 788 // Sort the sample arrays 789 sort(x, SAMPLE_1_NAME); 790 sort(y, SAMPLE_2_NAME); 791 final int n = x.length; 792 final int m = y.length; 793 794 // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively. 795 // Scale by n*m to use increments of m and n for x and y. 796 // Find the max difference between cdf_x and cdf_y. 
797 int i = 0; 798 int j = 0; 799 long d = 0; 800 long plus = 0; 801 long minus = 0; 802 // Ties: store the D+,D- for most extreme path though tie region(s) 803 long tplus = 0; 804 long tminus = 0; 805 do { 806 // No NaNs so compare using < and > 807 if (x[i] < y[j]) { 808 final double z = x[i]; 809 do { 810 i++; 811 d += m; 812 } while (i < n && x[i] == z); 813 plus = d > plus ? d : plus; 814 } else if (x[i] > y[j]) { 815 final double z = y[j]; 816 do { 817 j++; 818 d -= n; 819 } while (j < m && y[j] == z); 820 minus = d < minus ? d : minus; 821 } else { 822 // Traverse to the end of the tied section for d. 823 // Also compute the most extreme path through the tied region. 824 final double z = x[i]; 825 final long dd = d; 826 int k = i; 827 do { 828 i++; 829 } while (i < n && x[i] == z); 830 k = i - k; 831 d += k * (long) m; 832 // Extreme D+ path 833 tplus = d > tplus ? d : tplus; 834 k = j; 835 do { 836 j++; 837 } while (j < m && y[j] == z); 838 k = j - k; 839 d -= k * (long) n; 840 // Extreme D- path must start at the original d 841 tminus = Math.min(tminus, dd - k * (long) n); 842 // End of tied section 843 if (d > plus) { 844 plus = d; 845 } else if (d < minus) { 846 minus = d; 847 } 848 } 849 } while (i < n && j < m); 850 // The presence of any ties is flagged by a non-zero value for D+ or D-. 851 // Note we cannot use the selected tiesD value as in the one-sided case it may be zero 852 // and the non-selected D value will be non-zero. 853 tiesD[0] = tplus | tminus; 854 // For simplicity the correct tiesD is not returned (correct value is commented). 855 // The only case that matters is tiesD > D which is evaluated by the caller. 856 // Note however that the distance of tiesD < D is a measure of how little the 857 // tied region matters. 
858 if (alternative == AlternativeHypothesis.GREATER_THAN) { 859 sign[0] = 1; 860 // correct = max(tplus, plus) 861 tiesD[1] = tplus; 862 return plus; 863 } else if (alternative == AlternativeHypothesis.LESS_THAN) { 864 sign[0] = -1; 865 // correct = -min(tminus, minus) 866 tiesD[1] = -tminus; 867 return -minus; 868 } else { 869 // Two sided. 870 sign[0] = Double.compare(plus, -minus); 871 d = Math.max(plus, -minus); 872 // correct = max(d, max(tplus, -tminus)) 873 tiesD[1] = Math.max(tplus, -tminus); 874 return d; 875 } 876 } 877 878 /** 879 * Test if the two-sample integral Kolmogorov-Smirnov statistic is strictly greater 880 * than the specified values for D+ and D-. Note that D- should have a negative sign 881 * to impose an inclusive lower bound. 882 * 883 * <p>This method will destructively modify the input arrays (via a sort). 884 * 885 * <p>For a two-sided alternative hypothesis {@code plus} and {@code minus} should have the 886 * same magnitude with opposite signs. 887 * 888 * <p>For a one-sided alternative hypothesis the value of {@code plus} or {@code minus} 889 * should have the expected value of the statistic, and the opposite D should have the maximum 890 * or minimum long value. The {@code minus} should be negatively signed: 891 * 892 * <ul> 893 * <li>greater: {@code plus} = D, {@code minus} = {@link Long#MIN_VALUE}</li> 894 * <li>greater: {@code minus} = -D, {@code plus} = {@link Long#MAX_VALUE}</li> 895 * </ul> 896 * 897 * <p>Note: This method has not been specialized for the one-sided case. Specialization 898 * would eliminate a conditional branch for {@code d > Long.MAX_VALUE} or 899 * {@code d < Long.MIN_VALUE}. Since these branches are never possible in the one-sided case 900 * this should be efficiently chosen by branch prediction in a processor pipeline. 901 * 902 * @param x First sample (destructively modified by sort; must not contain NaN). 903 * @param y Second sample (destructively modified by sort; must not contain NaN). 
904 * @param plus Limit on D+ (inclusive upper bound). 905 * @param minus Limit on D- (inclusive lower bound). 906 * @return true if the D value exceeds the provided limits 907 */ 908 private static boolean testIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, long plus, long minus) { 909 // Sort the sample arrays 910 Arrays.sort(x); 911 Arrays.sort(y); 912 final int n = x.length; 913 final int m = y.length; 914 915 // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively. 916 // Scale by n*m to use increments of m and n for x and y. 917 // Find the any difference that exceeds the specified bounds. 918 int i = 0; 919 int j = 0; 920 long d = 0; 921 do { 922 // No NaNs so compare using < and > 923 if (x[i] < y[j]) { 924 final double z = x[i]; 925 do { 926 i++; 927 d += m; 928 } while (i < n && x[i] == z); 929 if (d > plus) { 930 return true; 931 } 932 } else if (x[i] > y[j]) { 933 final double z = y[j]; 934 do { 935 j++; 936 d -= n; 937 } while (j < m && y[j] == z); 938 if (d < minus) { 939 return true; 940 } 941 } else { 942 // Traverse to the end of the tied section for d. 943 final double z = x[i]; 944 do { 945 i++; 946 d += m; 947 } while (i < n && x[i] == z); 948 do { 949 j++; 950 d -= n; 951 } while (j < m && y[j] == z); 952 // End of tied section 953 if (d > plus || d < minus) { 954 return true; 955 } 956 } 957 } while (i < n && j < m); 958 // Note: Here d requires returning to zero. This is applicable to the one-sided 959 // cases since d may have always been above zero (favours D+) or always below zero 960 // (favours D-). This is ignored as the method is not called when dnm=0 is 961 // outside the inclusive bounds. 962 return false; 963 } 964 965 /** 966 * Creates a sampler to sample randomly from the combined distribution of the two samples. 967 * 968 * @param x First sample. 969 * @param y Second sample. 970 * @param rng Source of randomness. 
971 * @return the sampler 972 */ 973 private static DoubleSupplier createSampler(double[] x, double[] y, 974 UniformRandomProvider rng) { 975 return createSampler(x, y, rng, MAX_ARRAY_SIZE); 976 } 977 978 /** 979 * Creates a sampler to sample randomly from the combined distribution of the two 980 * samples. This will copy the input data for the sampler. 981 * 982 * @param x First sample. 983 * @param y Second sample. 984 * @param rng Source of randomness. 985 * @param maxArraySize Maximum size of a single array. 986 * @return the sampler 987 */ 988 static DoubleSupplier createSampler(double[] x, double[] y, 989 UniformRandomProvider rng, 990 int maxArraySize) { 991 final int n = x.length; 992 final int m = y.length; 993 final int len = n + m; 994 // Overflow safe: len > maxArraySize 995 if (len - maxArraySize > 0) { 996 // Support sampling with maximum length arrays 997 // (where a concatenated array is not possible) 998 // by choosing one or the other. 999 // - generate i in [-n, m) 1000 // - return i < 0 ? x[n + i] : y[i] 1001 // The sign condition is a 50-50 branch. 1002 // Perform branchless by extracting the sign bit to pick the array. 1003 // Copy the source data. 1004 final double[] xx = x.clone(); 1005 final double[] yy = y.clone(); 1006 final IntToDoubleFunction nextX = i -> xx[n + i]; 1007 final IntToDoubleFunction nextY = i -> yy[i]; 1008 // Arrange function which accepts the negative index at position [1] 1009 final IntToDoubleFunction[] next = {nextY, nextX}; 1010 return () -> { 1011 final int i = rng.nextInt(-n, m); 1012 return next[i >>> 31].applyAsDouble(i); 1013 }; 1014 } 1015 // Concatenate arrays 1016 final double[] z = new double[len]; 1017 System.arraycopy(x, 0, z, 0, n); 1018 System.arraycopy(y, 0, z, n, m); 1019 return () -> z[rng.nextInt(len)]; 1020 } 1021 1022 /** 1023 * Compute the D statistic from the integral D value. 1024 * 1025 * @param dnm Integral D-statistic value (in [0, n*m]). 1026 * @param n First sample size. 
1027 * @param m Second sample size. 1028 * @param gcd Greatest common divisor of n and m. 1029 * @return D-statistic value (in [0, 1]). 1030 */ 1031 private static double computeD(long dnm, int n, int m, int gcd) { 1032 // Note: Integer division using the gcd is intentional 1033 final long a = dnm / gcd; 1034 final int b = m / gcd; 1035 return a / ((double) n * b); 1036 } 1037 1038 /** 1039 * Computes \(P(D_{n,m} > d)\) for the 2-sample Kolmogorov-Smirnov statistic. 1040 * 1041 * @param dnm Integral D-statistic value (in [0, n*m]). 1042 * @param n First sample size. 1043 * @param m Second sample size. 1044 * @param gcd Greatest common divisor of n and m. 1045 * @param d D-statistic value (in [0, 1]). 1046 * @param exact whether to compute the exact probability; otherwise approximate. 1047 * @return probability 1048 * @see #twoSampleExactP(long, int, int, int, boolean, boolean) 1049 * @see #twoSampleApproximateP(double, int, int, boolean) 1050 */ 1051 private double twoSampleP(long dnm, int n, int m, int gcd, double d, boolean exact) { 1052 // Exact computation returns -1 if it cannot compute. 1053 double p = -1; 1054 if (exact) { 1055 p = twoSampleExactP(dnm, n, m, gcd, strictInequality, alternative == AlternativeHypothesis.TWO_SIDED); 1056 } 1057 if (p < 0) { 1058 p = twoSampleApproximateP(d, n, m, alternative == AlternativeHypothesis.TWO_SIDED); 1059 } 1060 return p; 1061 } 1062 1063 /** 1064 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 1065 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic, either the two-sided 1066 * \(D_{n,m}\) or one-sided \(D_{n,m}^+\}. See 1067 * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\). 1068 * 1069 * <p>The returned probability is exact. If the value cannot be computed this returns -1. 1070 * 1071 * <p>Note: This requires the greatest common divisor of n and m. 
 * The integral D statistic
 * in the range [0, n*m] is separated by increments of the gcd. The method will only
 * compute p-values for valid values of D by calculating for D/gcd.
 * Strict inequality is performed using the next valid value for D.
 *
 * @param dnm Integral D-statistic value (in [0, n*m]).
 * @param n First sample size.
 * @param m Second sample size.
 * @param gcd Greatest common divisor of n and m.
 * @param strict whether or not the probability to compute is expressed as a strict inequality.
 * @param twoSided whether D refers to D or D+.
 * @return probability that a randomly selected m-n partition of m + n generates D
 * greater than (resp. greater than or equal to) {@code d} (or -1)
 */
    static double twoSampleExactP(long dnm, int n, int m, int gcd, boolean strict, boolean twoSided) {
        // Create the statistic in [0, lcm]
        // For strict inequality D > d the result is the same if we compute for D >= (d+1)
        final long d = dnm / gcd + (strict ? 1 : 0);

        // P-value methods compute for d <= lcm (least common multiple)
        final long lcm = (long) n * (m / gcd);
        if (d > lcm) {
            return 0;
        }

        // Note: Some methods require m >= n, others n >= m
        final int a = Math.min(n, m);
        final int b = Math.max(n, m);

        if (twoSided) {
            // Any two-sided statistic dnm cannot be less than min(n, m) in the absence of ties.
            if (d * gcd <= a) {
                return 1;
            }
            // Here d in [2, lcm]
            if (n == m) {
                // Square (n == m) case uses the simpler outer method.
                return twoSampleTwoSidedPOutsideSquare(d, n);
            }
            return twoSampleTwoSidedPStabilizedInner(d, b, a, gcd);
        }
        // Any one-sided statistic cannot be less than 0
        if (d <= 0) {
            return 1;
        }
        // Here d in [1, lcm]
        if (n == m) {
            return twoSampleOneSidedPOutsideSquare(d, n);
        }
        return twoSampleOneSidedPOutside(d, a, b, gcd);
    }

    /**
     * Computes \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
     *
     * <p>The returned probability is exact, implemented using the stabilized inner method
     * presented in Viehmann (2021).
     *
     * <p>This is optimized for {@code m <= n}. If {@code m > n} an index-out-of-bounds
     * exception can occur.
     *
     * @param d Integral D-statistic value (in [2, lcm])
     * @param n Larger sample size.
     * @param m Smaller sample size.
     * @param gcd Greatest common divisor of n and m.
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     * greater than or equal to {@code d}
     */
    private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
        // Check the computation is possible.
        // Note that the possible paths is binom(m+n, n).
        // However the computation is stable above this limit.
        // Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
        // Possible terms to compute <= n * size(cij)
        // Simple limit based on the number of possible different p-values
        if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
            return -1;
        }

        // This could be updated to use d in [1, lcm].
        // Currently it uses d in [gcd, n*m].
        // Largest intermediate value is (dnm + im + n) which is within 2^63
        // if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
        final long dnm = d * gcd;

        // Viehmann (2021): Updated for i in [0, n], j in [0, m]
        // C_i,j = 1                                      if |i/n - j/m| >= d
        //       = 0                                      if |i/n - j/m| < d and (i=0 or j=0)
        //       = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)  otherwise
        // P2 = C_n,m
        // Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
        // so updates the scaling to compute the ranges. Also note that the listing uses
        // dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
        // The provided listing is explicit in the values for each j in the range.
        // It can be optimized given the known start and end j for each iteration as only
        // j where |i/n - j/m| < d must be processed:
        //   startJ where: im - jn < dnm : jn > im - dnm
        //     j = floor((im - dnm) / n) + 1  in [0, m]
        //   endJ where: jn - im >= dnm
        //     j = ceil((dnm + im) / n)       in [0, m+1]

        // First iteration with i = 0
        // j = ceil(dnm / n)
        int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));

        // Only require 1 array to store C_i-1,j as the startJ only ever increases
        // and we update lower indices using higher ones.
        // The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
        // Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
        // The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
        // j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
        // which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
        // and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
        final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];

        // Each iteration fills C_i,j with values and the remaining values are
        // kept as 1 for |i/n - j/m| >= d
        //assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
        for (int j = endJ; j < cij.length; j++) {
            //assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
            cij[j] = 1;
        }

        int startJ = 0;
        int length = endJ;
        double val = -1;
        long im = 0;
        for (int i = 1; i <= n; i++) {
            im += m;
            final int lastStartJ = startJ;

            // Compute C_i,j for startJ <= j < endJ
            // startJ = floor((im - dnm) / n) + 1  in [0, m]
            // endJ = ceil((dnm + im) / n)         in [0, m+1]
            startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
            endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));

            if (startJ >= endJ) {
                // No possible paths inside the boundary
                return 1;
            }

            //assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";

            // Initialize previous value C_i,j-1
            val = startJ == 0 ? 0 : 1;

            //assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
            //assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
            for (int j = startJ; j < endJ; j++) {
                //assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
                // C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
                // Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
                // In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
                // Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
                val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
                cij[j - startJ] = val;
            }

            // Must keep the remaining values in C_i,j as 1 to allow
            // cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
            final int lastLength = length;
            length = endJ - startJ;
            for (int j = lastLength - length - 1; j >= 0; j--) {
                cij[length + j] = 1;
            }
        }
        // Return the most recently written value: C_n,m
        return val;
    }

    /**
     * Computes \(P(D_{n,m}^+ \ge d)\), where \(D_{n,m}^+\) is the 2-sample one-sided
     * Kolmogorov-Smirnov statistic.
     *
     * <p>The returned probability is exact, implemented using the outer method
     * presented in Hodges (1958).
     *
     * <p>This method will fail-fast and return -1 if the computation of the
     * numbers of paths overflows.
     *
     * @param d Integral D-statistic value (in [0, lcm])
     * @param n First sample size.
     * @param m Second sample size.
     * @param gcd Greatest common divisor of n and m.
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     * greater than or equal to {@code d}
     */
    private static double twoSampleOneSidedPOutside(long d, int n, int m, int gcd) {
        // Hodges, Fig.2
        // Lower boundary: (nx - my)/nm >= d : (nx - my) >= dnm
        // B(x, y) is the number of ways from (0, 0) to (x, y) without previously
        // reaching the boundary.
        // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
        // Total paths:
        // sum_y { B(x, y) binom(m+n-x-y, n-y) }

        // Normalized by binom(m+n, n). Check this is possible.
        final long lm = m;
        if (n + lm > Integer.MAX_VALUE) {
            return -1;
        }
        final double binom = binom(m + n, n);
        if (binom == Double.POSITIVE_INFINITY) {
            return -1;
        }

        // This could be updated to use d in [1, lcm].
        // Currently it uses d in [gcd, n*m].
        final long dnm = d * gcd;

        // Visit all x in [0, m] where (nx - my) >= d for each increasing y in [0, n].
        // x = ceil( (d + my) / n ) = (d + my + n - 1) / n
        // y = ceil( (nx - d) / m ) = (nx - d + m - 1) / m
        // Note: n m integer, d in [0, nm], the intermediate cannot overflow a long.
        // x | y=0 = (d + n - 1) / n
        final int x0 = (int) ((dnm + n - 1) / n);
        if (x0 >= m) {
            // Only a single boundary point is reachable.
            return 1 / binom;
        }
        // The y above is the y *on* the boundary. Set the limit as the next y above:
        // y | x=m = 1 + floor( (nx - d) / m ) = 1 + (nm - d) / m
        final int maxy = (int) ((n * lm - dnm + m) / m);
        // Compute x and B(x, y) for visited B(x,y)
        final int[] xy = new int[maxy];
        final double[] bxy = new double[maxy];
        xy[0] = x0;
        bxy[0] = 1;
        for (int y = 1; y < maxy; y++) {
            final int x = (int) ((dnm + lm * y + n - 1) / n);
            // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
            // Add the terms to subtract as a negative sum.
            final Sum b = Sum.create();
            for (int yy = 0; yy < y; yy++) {
                // Here: previousX = x - xy[yy] : previousY = y - yy
                // bxy[yy] is the paths to (previousX, previousY)
                // binom represent the paths from (previousX, previousY) to (x, y)
                b.addProduct(bxy[yy], -binom(x - xy[yy] + y - yy, y - yy));
            }
            b.add(binom(x + y, y));
            xy[y] = x;
            bxy[y] = b.getAsDouble();
        }
        // sum_y { B(x, y) binom(m+n-x-y, n-y) }
        final Sum sum = Sum.create();
        for (int y = 0; y < maxy; y++) {
            sum.addProduct(bxy[y], binom(m + n - xy[y] - y, n - y));
        }
        // No individual term should have overflowed since binom is finite.
        // Any sum above 1 is floating-point error.
        return KolmogorovSmirnovDistribution.clipProbability(sum.getAsDouble() / binom);
    }

    /**
     * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample one-sided
     * Kolmogorov-Smirnov statistic.
     *
     * <p>The returned probability is exact, implemented using the outer method
     * presented in Hodges (1958).
     *
     * @param d Integral D-statistic value (in [1, lcm])
     * @param n Sample size.
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     * greater than or equal to {@code d}
     */
    private static double twoSampleOneSidedPOutsideSquare(long d, int n) {
        // Hodges (1958) Eq. 2.3:
        // p = binom(2n, n-a) / binom(2n, n)
        // a in [1, n] == d * n == dnm / n
        final int a = (int) d;

        // Rearrange:
        // p = ( 2n! / ((n-a)! (n+a)!) ) / ( 2n! / (n! n!) )
        //   = n! n! / ( (n-a)! (n+a)! )
        // Perform using pre-computed factorials if possible.
        if (n + a <= MAX_FACTORIAL) {
            final double x = Factorial.doubleValue(n);
            final double y = Factorial.doubleValue(n - a);
            final double z = Factorial.doubleValue(n + a);
            return (x / y) * (x / z);
        }
        // p = n! / (n-a)!  *  n! / (n+a)!
        //       n * (n-1) * ... * (n-a+1)
        //   = -----------------------------
        //     (n+a) * (n+a-1) * ... * (n+1)
        // The loop terminates early on underflow to zero.
        double p = 1;
        for (int i = 0; i < a && p != 0; i++) {
            p *= (n - i) / (1.0 + n + i);
        }
        return p;
    }

    /**
     * Computes \(P(D_{n,n} \ge d)\), where \(D_{n,n}\) is the 2-sample two-sided
     * Kolmogorov-Smirnov statistic.
     *
     * <p>The returned probability is exact, implemented using the outer method
     * presented in Hodges (1958).
     *
     * @param d Integral D-statistic value (in [1, n])
     * @param n Sample size.
     * @return probability that a randomly selected m-n partition of n + n generates \(D_{n,n}\)
     * greater than or equal to {@code d}
     */
    private static double twoSampleTwoSidedPOutsideSquare(long d, int n) {
        // Hodges (1958) Eq. 2.4:
        // p = 2 [ binom(2n, n-a) - binom(2n, n-2a) + binom(2n, n-3a) - ... ] / binom(2n, n)
        // a in [1, n] == d * n == dnm / n

        // As per twoSampleOneSidedPOutsideSquare, divide by binom(2n, n) and each term
        // can be expressed as a product:
        //         (              n - i                  n - i                   n - i         )
        // p = 2 * ( prod_i=0^a --------- - prod_i=0^2a --------- + prod_i=0^3a --------- + ... )
        //         (            1 + n + i               1 + n + i               1 + n + i      )
        // for ja in [1, ..., n/a]
        // Avoid repeat computation of terms by extracting common products:
        // p = 2 * ( p0a * (1 - p1a * (1 - p2a * (1 - ... ))) )
        // where each term pja is prod_i={ja}^{ja+a} for all j in [1, n / a]

        // The first term is the one-sided p.
        final double p0a = twoSampleOneSidedPOutsideSquare(d, n);
        if (p0a == 0) {
            // Underflow - nothing more to do
            return 0;
        }
        // Compute the inner-terms from small to big.
        // j = n / (d/n) ~ n*n / d
        // j is a measure of how extreme the d value is (small j is extreme d).
        // When j is above 0 a path may traverse from the lower boundary to the upper boundary.
        final int a = (int) d;
        double p = 0;
        // Evaluate the nested form p = p0a * (1 - p1a * (1 - p2a * (1 - ...))) from the
        // innermost (largest j) term outwards.
        for (int j = n / a; j > 0; j--) {
            double pja = 1;
            final int jaa = j * a + a;
            // Since p0a did not underflow we avoid the check for pj != 0
            for (int i = j * a; i < jaa; i++) {
                pja *= (n - i) / (1.0 + n + i);
            }
            p = pja * (1 - p);
        }
        p = p0a * (1 - p);
        // Clip: any value above 1 after doubling is floating-point error.
        return Math.min(1, 2 * p);
    }

    /**
     * Compute the binomial coefficient binom(n, k).
     *
     * @param n N.
     * @param k K.
1420 * @return binom(n, k) 1421 */ 1422 private static double binom(int n, int k) { 1423 return BinomialCoefficientDouble.value(n, k); 1424 } 1425 1426 /** 1427 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 1428 * is the 2-sample Kolmogorov-Smirnov statistic. See 1429 * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\). 1430 * 1431 * <p>Specifically, what is returned is \(1 - CDF(d, \sqrt{mn / (m + n)})\) where CDF 1432 * is the cumulative density function of the two-sided one-sample Kolmogorov-Smirnov 1433 * distribution. 1434 * 1435 * @param d D-statistic value. 1436 * @param n First sample size. 1437 * @param m Second sample size. 1438 * @param twoSided True to compute the two-sided p-value; else one-sided. 1439 * @return approximate probability that a randomly selected m-n partition of m + n generates 1440 * \(D_{n,m}\) greater than {@code d} 1441 */ 1442 static double twoSampleApproximateP(double d, int n, int m, boolean twoSided) { 1443 final double nn = Math.min(n, m); 1444 final double mm = Math.max(n, m); 1445 if (twoSided) { 1446 // Smirnov's asymptotic formula: 1447 // P(sqrt(N) D_n > x) 1448 // N = m*n/(m+n) 1449 return KolmogorovSmirnovDistribution.Two.sf(d, (int) Math.round(mm * nn / (mm + nn))); 1450 } 1451 // one-sided 1452 // Use Hodges Eq 5.3. Requires m >= n 1453 // Correct for m=n, m an integral multiple of n, and 'on the average' for m nearly equal to n 1454 final double z = d * Math.sqrt(nn * mm / (nn + mm)); 1455 return Math.exp(-2 * z * z - 2 * z * (mm + 2 * nn) / Math.sqrt(mm * nn * (mm + nn)) / 3); 1456 } 1457 1458 /** 1459 * Verifies that {@code array} has length at least 2. 1460 * 1461 * @param array Array to test. 
1462 * @return the length 1463 * @throws IllegalArgumentException if array is too short 1464 */ 1465 private static int checkArrayLength(double[] array) { 1466 final int n = array.length; 1467 if (n <= 1) { 1468 throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n); 1469 } 1470 return n; 1471 } 1472 1473 /** 1474 * Sort the input array. Throws an exception if NaN values are 1475 * present. It is assumed the array is non-zero length. 1476 * 1477 * @param x Input array. 1478 * @param name Name of the array. 1479 * @return a reference to the input (sorted) array 1480 * @throws IllegalArgumentException if {@code x} contains NaN values. 1481 */ 1482 private static double[] sort(double[] x, String name) { 1483 Arrays.sort(x); 1484 // NaN will be at the end 1485 if (Double.isNaN(x[x.length - 1])) { 1486 throw new InferenceException(name + " contains NaN"); 1487 } 1488 return x; 1489 } 1490}