// fetchSetsForSomeGenes.js

import { rangesToBlocks, consolidateRanges } from "./consolidateRanges.js";
import { extractRanges } from "./extractRanges.js";
import { fetchSetsForAllGenes } from "./fetchSetsForAllGenes.js";
import * as utils from "./utils.js";

/**
 * Look up (or lazily build) the per-species state used by the functions in this file.
 *
 * The state lives in `config.cache.fetchSetsForSomeGenes`, a Map keyed by species that is created on first use.
 * On a cache miss, the ranges for `<species>_gene2set.tsv` are obtained via `utils.retrieveRanges`,
 * grouped into blocks via `rangesToBlocks` using `config.consolidateBlockSize`, and stored with an empty `prior` Map.
 *
 * @param {string} species - Species identifier, used as the cache key and as the file name prefix.
 * @param {object} config - Configuration object; its `cache` property is mutated in place.
 *
 * @return {object} Object with `fname` (the file name for this species)
 * and `spfound` (the cached `{ intervals, blocked, prior }` entry).
 */
async function initialize(species, config) {
    if (!("fetchSetsForSomeGenes" in config.cache)) {
        config.cache.fetchSetsForSomeGenes = new Map();
    }
    const perSpecies = config.cache.fetchSetsForSomeGenes;

    const fname = species + "_gene2set.tsv";

    // Cache hit: hand back the existing entry untouched.
    const existing = perSpecies.get(species);
    if (existing !== undefined) {
        return { fname, spfound: existing };
    }

    // Cache miss: fetch the ranges once and remember everything derived from them.
    const intervals = await utils.retrieveRanges(config, fname);
    const created = {
        intervals,
        blocked: rangesToBlocks(intervals, config.consolidateBlockSize),
        prior: new Map(),
    };
    perSpecies.set(species, created);

    return { fname, spfound: created };
}

/**
 * Count the number of genes in the Gesel database that belong to at least one set.
 *
 * The return value should be used as the total number of balls when performing a hypergeometric test for gene set enrichment,
 * instead of the length of the array returned by {@linkcode fetchAllGenes}.
 * This ensures that uninteresting genes like pseudo-genes or predicted genes are ignored during the calculation.
 * Otherwise, unknown genes would inappropriately increase the number of balls and understate the enrichment p-values.
 *
 * If {@linkcode fetchSetsForAllGenes} has already populated its cache, the count is derived from its per-gene arrays.
 * Otherwise, it is derived from the range boundaries of the gene-to-set file without fetching the file contents.
 * See also the documentation for {@linkcode fetchSetsForSomeGenes} for some comments about caching.
 *
 * @param {string} species - The taxonomy ID of the species of interest, e.g., `"9606"` for human.
 * @param {object} config - Configuration object, see {@linkcode newConfig}.
 *
 * @return {number} Number of genes that belong to at least one set for `species`.
 * This can be used as a more appropriate universe size in {@linkcode testEnrichment}.
 *
 * @async
 */
export async function effectiveNumberOfGenes(species, config) {
    if ("fetchSetsForAllGenes" in config.cache) {
        const everything = await fetchSetsForAllGenes(species, config);
        let nonEmpty = 0;
        for (const sets of everything) {
            if (sets.length > 0) {
                nonEmpty++;
            }
        }
        return nonEmpty;
    }

    const { spfound } = await initialize(species, config);
    const intervals = spfound.intervals;
    let okay = 0;
    for (let i = 1; i < intervals.length; i++) {
        // Consecutive boundaries that differ by exactly one are treated as an empty entry:
        // the +1 skips the newline, so only a larger gap counts as a gene with at least one set.
        if (intervals[i] > intervals[i - 1] + 1) {
            okay++;
        }
    }

    return okay;
}

/**
 * Fetch the identities of the sets that contain each of a handful of genes in the Gesel database.
 * This can be more efficient than {@linkcode fetchSetsForAllGenes} when only a few genes are of interest.
 *
 * Results for the requested `genes` are added to an in-memory cache on every call.
 * Later calls re-use as many cached genes as possible and only request the missing ones from the Gesel database.
 *
 * If {@linkcode fetchSetsForAllGenes} has been called, its cached data is used directly by `fetchSetsForSomeGenes` and no extra requests are made.
 * For a large `genes`, it may be more efficient to call {@linkcode fetchSetsForAllGenes} first to prepare the cache.
 *
 * @param {string} species - The taxonomy ID of the species of interest, e.g., `"9606"` for human.
 * @param {Array} genes - Array of gene IDs.
 * Each ID is a row index in any of the arrays returned by {@linkcode fetchAllGenes}.
 * @param {object} config - Configuration object, see {@linkcode newConfig}.
 *
 * @return {Array} Array of length equal to `genes`.
 * Each entry is a Uint32Array containing the IDs of all sets that contain the corresponding gene in `genes`.
 * Set IDs refer to indices in {@linkcode fetchAllSets}.
 *
 * @async
 */
export async function fetchSetsForSomeGenes(species, genes, config) {
    // Fast path: the full gene-to-set mapping is already cached, so just index into it.
    if ("fetchSetsForAllGenes" in config.cache) {
        const allSets = await fetchSetsForAllGenes(species, config);
        return [...genes].map(gene => allSets[gene]);
    }

    const { fname, spfound } = await initialize(species, config);
    const { intervals, blocked, prior } = spfound;

    // Only genes that are not yet in the per-species cache need a request.
    const missing = utils.setdiff(genes, prior);
    if (missing.length > 0) {
        const { start, end, requested } = consolidateRanges(intervals, blocked, missing);
        const payload = await config.fetchRanges(fname, start, end);

        // Decode each gene's slice and remember it for subsequent calls.
        const remember = (gene, sliced) => {
            prior.set(gene, utils.decodeIndicesFromBuffer(sliced));
        };
        extractRanges(payload, start, end, intervals, requested, remember);
    }

    return [...genes].map(gene => prior.get(gene));
}