testEnrichment.js - Documentation

const factorials = [1];

// Only exported for testing.
export function lfactorial(x) {
    if (x <= 44) {
        // Computing it exactly for low-ish numbers. The limit of 44! is
        // defined as the largest exact value representable by a double.
        while (factorials.length <= x) {
            let current = factorials.length;
            factorials.push(factorials[current - 1] * current);
        }
        return Math.log(factorials[x]);
    } else {
        // Using Ramanujan's approximation otherwise.
        return 1/6 * Math.log(x * (1 + 4 * x * (1 + 2 * x)) + 1/30) + x * Math.log(x) - x + 0.5 * Math.log(3.14159265358979323846);
    }
}

/**
 * Hypergeometric test for gene set enrichment, based on the overlap between a user-supplied list and the gene set.
 *
 * @param {number} overlap - Number of overlapping genes between the user's list and the gene set, typically obtained from {@linkcode findOverlappingSets}.
 * @param {number} listSize - Size of the user's list.
 * @param {number} setSize - Size of the gene set, see the `size` property from {@linkcode fetchSingleSet}.
 * @param {number} universe - Size of the gene universe (i.e., the total number of genes for this species).
 * This can either be obtained from the arrays in {@linkcode fetchAllGenes} or using {@linkcode effectiveNumberOfGenes}.
 *
 * @return {number} P-value for the enrichment of the user's list in the gene set.
 * This may be NaN if the inputs are inconsistent, e.g., `overlap` is greater than `listSize` or `setSize`.
 */
export function testEnrichment(overlap, listSize, setSize, universeSize) {
    // This code is pretty much paraphrased from R's dhyper.c.
    let notInSet = universeSize - setSize;
    let inSet = setSize;
    let flip = false;

    // If it's impossible, you get a p-value of NaN.
    if (overlap > listSize || overlap > inSet || listSize > universeSize || notInSet < 0) {
        return Number.NaN;
    }

    if (overlap < listSize - notInSet) {
        return Number.NaN;
    }

    if (listSize == universeSize) {
        if (overlap == inSet) {
            return 1;
        } else {
            return Number.NaN;
        }
    }

    if (notInSet == 0) {
        if (overlap == listSize) {
            return 1;
        } else {
            return Number.NaN;
        }
    }

    // We want the upper tail (inclusive), but the code below computes the lower tail (inclusive).
    // So an easy fix is to subtract one from the overlaps, and then we're basically computing
    // lower tail (exclusive), such that 1 minus that gives us upper tail (inclusive).
    if (overlap > 0) {
        overlap--;
    } else {
        // If overlap = 0, upper tail (inclusive) is just 1.
        return 1;
    }

    // Swapping if the probabilities are too high, to avoid excessive
    // cumulative sums and associated inaccuracies as it approaches unity.
    if (inSet > 0 && universeSize > 0 && overlap / inSet > listSize / universeSize) {
        overlap = listSize - overlap - 1;
        let tmp = inSet;
        inSet = notInSet;
        notInSet = tmp;
        flip = true;
    }

    // Computing the cumulative part of the cumulative probability.
    // This is equivalent to the ratio of the cumulative probability
    // with the probability mass of the hypergeometric distribution.
    let counter = overlap;
    let psum = 0;
    let running = 1;
    while (counter > 0 && running > 0) {
        running *= counter * (notInSet - listSize + counter) / (listSize + 1 - counter) / (inSet + 1 - counter);
        psum += running;
        counter--;
    }
    psum += 1;

    // Now computing the scaling part, i.e., the probability mass.
    let dmass = (lfactorial(inSet) - lfactorial(overlap) - lfactorial(inSet - overlap)) + // choose(inSet, overlap)
        (lfactorial(notInSet) - lfactorial(listSize - overlap) - lfactorial(notInSet - listSize + overlap)) - // choose(notInSet, listSize - overlap)
        (lfactorial(universeSize) - lfactorial(listSize) - lfactorial(universeSize - listSize)); // choose(universeSize, listSize)

    let finalp = psum * Math.exp(dmass);
    return (flip ? finalp : 1 - finalp); // remember, we want the upper tail, hence we only subtract if flip = false.
}