searchSetText.js

import * as utils from "./utils.js";

/**
 * Bisect a sorted collection looking for `query`.
 *
 * @param {*} query - Value to look for; it is compared to elements with `<` and `>`.
 * @param {Array} vector - Sorted, indexable collection with a `length`.
 *
 * @return {number} Index of an element that is neither smaller nor larger than `query`, if the bisection lands on one.
 * Otherwise, the index at which the search interval collapsed, i.e., the first position whose element is not smaller than `query`
 * (equal to `vector.length` if every element is smaller).
 */
export function binarySearch(query, vector) {
    let lo = 0;
    let hi = vector.length;

    while (lo < hi) {
        const middle = Math.floor((lo + hi) / 2);
        const probe = vector[middle];

        if (probe < query) {
            // Everything up to and including 'middle' is too small.
            lo = middle + 1;
            continue;
        }

        if (probe > query) {
            // 'middle' itself might still be the insertion point, so keep it as the exclusive upper bound.
            hi = middle;
            continue;
        }

        return middle;
    }

    return lo;
}

/**
 * Look up the set indices associated with a single token, lazily initializing and filling the per-species caches.
 *
 * On the first call for a species, the named ranges for `species + "_" + file` are retrieved and stored in
 * `all_ranges` and `all_ordered`, and an empty token cache is created in `all_cache`.
 *
 * Tokens containing `*` (any run of characters) or `?` (exactly one character) are treated as wildcard patterns:
 * every known token that matches the whole pattern contributes its sets, and the union is returned.
 * Tokens without wildcards are looked up directly.
 *
 * @param {string} species - Species identifier, used as the key into all maps and as the file prefix.
 * @param {?string} token - Token to look up. If `null` or `undefined`, only the initialization is performed.
 * @param {string} file - File suffix, appended to `species` and an underscore to form the actual file name.
 * @param {Map} all_ranges - Per-species map of token to byte range (as returned by `utils.retrieveNamedRanges`).
 * @param {Map} all_ordered - Per-species ordered list of tokens; the prefix search assumes it is sorted - TODO confirm against `utils.retrieveNamedRanges`.
 * @param {Map} all_cache - Per-species map of token (or wildcard pattern) to previously computed results.
 *
 * @return {Uint32Array|undefined} Set indices for `token`, or `undefined` if `token` was `null`/`undefined`.
 * An empty `Uint32Array` is returned for a non-wildcard token that is not in the index.
 */
async function fetchSetsByToken(species, token, file, all_ranges, all_ordered, all_cache) {
    const actual_file = species + "_" + file;

    let cached = all_cache.get(species);
    if (typeof cached === "undefined") {
        const { ranges, order } = await utils.retrieveNamedRanges(actual_file);
        all_ranges.set(species, ranges);
        all_ordered.set(species, order);
        cached = new Map;
        all_cache.set(species, cached);
    }

    // A null token is a request to initialize the caches and nothing more.
    if (token == null) {
        return;
    }

    const tfound = cached.get(token);
    if (typeof tfound !== "undefined") {
        return tfound;
    }

    const ranges = all_ranges.get(species);
    let output;
    if (token.includes("*") || token.includes("?")) {
        const ordered = all_ordered.get(species);

        // Wildcard handling. The literal prefix before the first wildcard narrows
        // the scan to a contiguous stretch of the ordered token list.
        const initstub = token.replace(/[*?].*/, "");
        let pos = (initstub === "" ? 0 : binarySearch(initstub, ordered));

        // The pattern must match the entire candidate, so it is anchored at both ends;
        // any regex metacharacters other than our two wildcards are taken literally.
        const escaped = token.replace(/[.+^${}()|[\]\\]/g, "\\$&");
        const regex = new RegExp("^" + escaped.replace(/[*]/g, ".*").replace(/[?]/g, ".") + "$");

        const collected = [];
        const to_cache = [];
        const union = new Set;

        while (pos < ordered.length) {
            const candidate = ordered[pos];
            if (initstub !== "" && !candidate.startsWith(initstub)) {
                break;
            }

            // Sharing the prefix is necessary but not sufficient; the rest of the pattern has to match too.
            if (regex.test(candidate)) {
                const cfound = cached.get(candidate);
                if (typeof cfound === "undefined") {
                    const rr = ranges.get(candidate);
                    collected.push(utils.retrieveBytes(actual_file, rr[0], rr[1]).then(utils.convertToUint32Array));
                    to_cache.push(candidate);
                } else {
                    for (const y of cfound) {
                        union.add(y);
                    }
                }
            }

            pos++;
        }

        // All outstanding range requests are issued before waiting on any of them.
        const resolved = await Promise.all(collected);
        for (let i = 0; i < resolved.length; i++) {
            const x = resolved[i];
            cached.set(to_cache[i], x);
            for (const y of x) {
                union.add(y);
            }
        }

        output = new Uint32Array(union);

    } else {
        // Direct handling.
        const rr = ranges.get(token);
        if (typeof rr === "undefined") {
            return new Uint32Array;
        }
        const text = await utils.retrieveBytes(actual_file, rr[0], rr[1]);
        output = utils.convertToUint32Array(text);
    }

    cached.set(token, output);
    return output;
}

// Per-species state for the index of tokens found in set names:
// cached lookup results, byte ranges for each token, and the ordered token list.
const n_cache = new Map();
const n_ranges = new Map();
const n_ordered = new Map();

/**
 * Look up a token in the set-name index of `species`, using the module-level name caches.
 *
 * @param {string} species - Species identifier.
 * @param {?string} token - Token to look up, or `null` to only initialize the caches.
 *
 * @return Whatever `fetchSetsByToken` resolves to for the names file.
 */
async function fetchSetsByNameToken(species, token) {
    const file = "tokens-names.tsv";
    return fetchSetsByToken(species, token, file, n_ranges, n_ordered, n_cache);
}

// Per-species state for the index of tokens found in set descriptions:
// cached lookup results, byte ranges for each token, and the ordered token list.
const d_cache = new Map();
const d_ranges = new Map();
const d_ordered = new Map();

/**
 * Look up a token in the set-description index of `species`, using the module-level description caches.
 *
 * @param {string} species - Species identifier.
 * @param {?string} token - Token to look up, or `null` to only initialize the caches.
 *
 * @return Whatever `fetchSetsByToken` resolves to for the descriptions file.
 */
async function fetchSetsByDescriptionToken(species, token) {
    const file = "tokens-descriptions.tsv";
    return fetchSetsByToken(species, token, file, d_ranges, d_ordered, d_cache);
}

/**
 * Fill a token cache from a fully downloaded index, pairing each decompressed line with the token at the same position.
 *
 * @param {string} species - Species identifier, only used in error messages.
 * @param {object} resp - Response-like object exposing `ok` and `arrayBuffer()`.
 * @param {Array} ordered - Tokens, in the same order as the lines of the downloaded payload.
 * @param {Map} cache - Map to populate; each token is mapped to the converted contents of its line.
 * @param {string} msg - Short label for the kind of token, only used in error messages.
 *
 * @return {Promise<undefined>} Resolves once `cache` has been populated.
 * Rejects if the response is not OK or if the number of lines differs from the number of tokens.
 */
export async function preloadTokens(species, resp, ordered, cache, msg) {
    if (!resp.ok) {
        const message = "failed to fetch full set of " + msg + " tokens for species '" + species + "'";
        throw new Error(message);
    }

    const payload = await resp.arrayBuffer();
    const lines = utils.decompressLines(payload);

    if (lines.length !== ordered.length) {
        const message = "mismatch in lengths between token names and set indices for species '" + species + "'";
        throw new Error(message);
    }

    // Line 'idx' holds the set indices for token 'idx'.
    for (let idx = 0; idx < lines.length; idx++) {
        const sets = utils.convertToUint32Array(lines[idx]);
        cache.set(ordered[idx], sets);
    }
}

/**
 * Preloads the search indices for use in {@linkcode searchSetText}.
 * The full name and description token files are downloaded once and used to populate the per-species token caches,
 * so that later calls to {@linkcode searchSetText} can be answered from the cache rather than via HTTP range requests.
 *
 * @param {string} species - The taxonomy ID of the species of interest, e.g., `"9606"` for human.
 *
 * @return {Promise<undefined>} Resolves once both caches have been populated.
 * @async
 */
export async function preloadSearchSetText(species) {
    // The downloads and the cache initializations are independent, so they are all started together.
    const pending = [
        utils.reference_download(species + "_tokens-names.tsv.gz"),
        utils.reference_download(species + "_tokens-descriptions.tsv.gz"),
        fetchSetsByNameToken(species, null),
        fetchSetsByDescriptionToken(species, null),
    ];
    const [nameResponse, descriptionResponse] = await Promise.all(pending);

    // The ordered token lists and caches only exist after the initialization calls above have finished.
    await preloadTokens(species, nameResponse, n_ordered.get(species), n_cache.get(species), "name");
    await preloadTokens(species, descriptionResponse, d_ordered.get(species), d_cache.get(species), "description");
}

/**
 * Search for sets whose names and/or descriptions contain every word of a query.
 *
 * @param {string} species - The taxonomy ID of the species of interest, e.g., `"9606"` for human.
 * @param {string} query - Query string containing multiple words to search in the names and/or descriptions of each set.
 *
 * The query is converted to lower case.
 * Each stretch of alphanumeric characters and dashes is treated as a single word.
 * All other characters are treated as punctuation between words, except for the following wildcards:
 *
 * - `*`: match zero or more alphanumeric or dash characters.
 * - `?`: match exactly one alphanumeric or dash character.
 *
 * Empty words and words consisting of a single dash are ignored, and repeated words are only searched once.
 * A set's name and/or description must contain all remaining words in `query` to be considered a match.
 * @param {object} [options={}] - Optional parameters.
 * @param {boolean} [options.inName=true] - Whether to search the name of the set for matching words.
 * @param {boolean} [options.inDescription=true] - Whether to search the description of the set for matching words.
 *
 * @return {Array} Array of indices of the sets with names and/or descriptions that match `query`.
 * This is empty if `query` does not contain any searchable words.
 * @async
 */
export async function searchSetText(species, query, { inName = true, inDescription = true } = {}) {
    // Tokenizing the query using the same logic as in the feedstock repository,
    // but preserving our wildcards for special handling later.
    const processed = query.toLowerCase().replace(/[^a-zA-Z0-9-?*]/g, " ");

    // Leading/trailing separators produce empty strings when splitting, and a lone dash is not a word;
    // both must be dropped or they would be searched as tokens that match nothing.
    const words = processed.split(/\s+/).filter(x => x !== "" && x !== "-");

    // De-duplicating once, up front, keeps the fetched results aligned with 'tokens' by position.
    const tokens = Array.from(new Set(words));

    const init = [];
    if (inName) {
        init.push(fetchSetsByNameToken(species, null));
    }
    if (inDescription) {
        init.push(fetchSetsByDescriptionToken(species, null));
    }
    await Promise.all(init); // force initialization of all caches.

    // Nothing to search for, so nothing can match.
    if (tokens.length === 0) {
        return [];
    }

    // Every lookup is started before any of them is awaited.
    const gathered_names = (inName ? tokens.map(tok => fetchSetsByNameToken(species, tok)) : []);
    const gathered_descriptions = (inDescription ? tokens.map(tok => fetchSetsByDescriptionToken(species, tok)) : []);

    const resolved_names = await Promise.all(gathered_names);
    const resolved_descriptions = await Promise.all(gathered_descriptions);

    // For each word, a set qualifies if the word is in its name or its description,
    // so the two results are concatenated before intersecting across words.
    const gathered = tokens.map((tok, i) => {
        const n = (inName ? resolved_names[i] : []);
        const d = (inDescription ? resolved_descriptions[i] : []);

        const combined = new Uint32Array(n.length + d.length);
        combined.set(n);
        combined.set(d, n.length);
        return combined;
    });

    return utils.intersect(gathered);
}