import * as utils from "./utils.js";
import { rangesToBlocks, consolidateRanges } from "./consolidateRanges.js";
import { extractRanges } from "./extractRanges.js";
/**
 * Binary search over a sorted array.
 *
 * @param {*} query - Value to look for; compared against elements with `<` and `>`.
 * @param {Array} vector - Array to search, expected to be sorted in ascending order.
 *
 * @return {number} Index of an element that is neither less than nor greater than `query`, if one is hit during the search.
 * Otherwise, the position at which the search interval collapsed, i.e., the index of the first element greater than `query`
 * (or `vector.length` if there is no such element).
 */
export function binarySearchRight(query, vector) {
    let lo = 0;
    let hi = vector.length;

    while (lo < hi) {
        // Same midpoint as truncating (lo + hi) / 2 for non-negative integers.
        const mid = lo + Math.floor((hi - lo) / 2);
        const probe = vector[mid];

        if (probe < query) {
            lo = mid + 1;
            continue;
        }
        if (probe > query) {
            hi = mid;
            continue;
        }

        // Neither smaller nor larger: treat as a hit.
        return mid;
    }

    return lo;
}
/**
 * Find the sets associated with each token, for one species and one text field.
 *
 * The token index file (`<species>_tokens-<type>.tsv`) is described by named byte ranges that are
 * retrieved once via `utils.retrieveNamedRanges` and then cached in `config.cache.fetchSetsByToken`.
 * The cache is keyed by file name so that the "names" and "descriptions" indices of the same
 * species never share an entry. Per-token results are memoized in the `prior` map of that entry,
 * so each token's payload is only fetched the first time it is requested.
 *
 * @param {string} species - Species identifier used to build the file name.
 * @param {Array} tokens - Array of token strings. Tokens containing `*` or `?` are treated as wildcard patterns.
 * @param {string} type - Field to search; used verbatim in the file name (the callers in this file pass `"names"` or `"descriptions"`).
 * @param {object} config - Configuration object; this function reads `cache`, `consolidateBlockSize` and `fetchRanges`.
 *
 * @return {Array} Array of the same length as `tokens`. Each entry is the cached array of indices for the
 * corresponding token: an empty array for a token that is absent from the index, and a sorted, de-duplicated
 * union for a wildcard token. The arrays are the cached objects themselves, not copies.
 * @async
 */
async function fetchSetsByToken(species, tokens, type, config) {
    let cache;
    if ("fetchSetsByToken" in config.cache) {
        cache = config.cache.fetchSetsByToken;
    } else {
        cache = new Map;
        config.cache.fetchSetsByToken = cache;
    }

    const fname = species + "_tokens-" + type + ".tsv";

    // Keyed by file name rather than species alone: the ranges and token order differ between
    // types, so a species-only key would serve one type's index in response to the other.
    let spfound = cache.get(fname);
    if (typeof spfound === "undefined") {
        const { ranges, order } = await utils.retrieveNamedRanges(config, fname);
        spfound = {
            intervals: ranges,
            order: order,
            blocked: rangesToBlocks(ranges, config.consolidateBlockSize),
            prior: new Map
        };
        cache.set(fname, spfound);
    }

    // Wildcard token -> positions (in 'order') of the concrete tokens that it matches.
    const partial_mapping = new Map;
    // Positions in 'order' whose payloads need to be fetched.
    const to_request = new Set;

    for (const token of tokens) {
        if (spfound.prior.has(token)) {
            continue;
        }

        if (token.includes("*") || token.includes("?")) {
            // Wildcard handling: everything before the first wildcard is a literal prefix, so we can
            // jump to the first candidate with that prefix and stop scanning once the prefix no longer matches.
            // NOTE(review): this relies on 'order' being sorted - confirm against retrieveNamedRanges().
            const initstub = token.replace(/[*?].*/, "");
            let pos = (initstub === "" ? 0 : binarySearchRight(initstub, spfound.order));
            const regex = new RegExp("^" + token.replace(/[*]/g, ".*").replace(/[?]/g, ".") + "$");

            const partial_idx = [];
            while (pos < spfound.order.length) {
                const candidate = spfound.order[pos];
                if (initstub !== "" && !candidate.startsWith(initstub)) {
                    break;
                }
                if (regex.test(candidate)) {
                    to_request.add(pos);
                    partial_idx.push(pos);
                }
                pos++;
            }
            partial_mapping.set(token, partial_idx);

        } else {
            // Direct handling: the token must be present verbatim in 'order'.
            const pos = binarySearchRight(token, spfound.order);
            if (pos < spfound.order.length && spfound.order[pos] === token) {
                to_request.add(pos);
            } else {
                // Unknown token; remember that it has no sets.
                spfound.prior.set(token, []);
            }
        }
    }

    if (to_request.size > 0) {
        const requested_idx = Array.from(to_request);
        const consolidated = consolidateRanges(spfound.intervals, spfound.blocked, requested_idx);
        const consolidated_parts = await config.fetchRanges(fname, consolidated.start, consolidated.end);
        extractRanges(
            consolidated_parts,
            consolidated.start,
            consolidated.end,
            spfound.intervals,
            consolidated.requested,
            (ii, sliced) => { spfound.prior.set(spfound.order[ii], utils.decodeIndicesFromBuffer(sliced)); }
        );
    }

    // Populating the cache for the wildcard-containing tokens, as the union of their concrete matches.
    for (const [partial, concretes] of partial_mapping.entries()) {
        const collected = new Set;
        for (const iv of concretes) {
            for (const m of spfound.prior.get(spfound.order[iv])) {
                collected.add(m);
            }
        }
        const collected_idx = Array.from(collected);
        collected_idx.sort((a, b) => a - b);
        spfound.prior.set(partial, collected_idx);
    }

    const output = [];
    for (const tok of tokens) {
        output.push(spfound.prior.get(tok));
    }
    return output;
}
/**
 * Search for sets whose name and/or description contain all words of a query.
 *
 * @param {string} species - The taxonomy ID of the species of interest, e.g., `"9606"` for human.
 * @param {string} query - Query string containing multiple words to search in the names and/or descriptions of each set.
 *
 * The query is converted to lowercase before tokenization.
 * Each stretch of alphanumeric characters and dashes is treated as a single word.
 * All other characters are treated as punctuation between words, except for the following wildcards:
 *
 * - `*`: match zero or more alphanumeric or dash characters.
 * - `?`: match exactly one alphanumeric or dash character.
 *
 * Empty words and words consisting of a single dash are ignored, and duplicate words are only searched once.
 * A set's name and/or description must contain all words in `query` to be considered a match.
 * @param {object} config - Configuration object, see {@linkcode newConfig}.
 * @param {object} [options={}] - Optional parameters.
 * @param {boolean} [options.inName=true] - Whether to search the name of the set for matching words.
 * @param {boolean} [options.inDescription=true] - Whether to search the description of the set for matching words.
 *
 * @return {Array} Array of indices of the sets with names and/or descriptions that match `query`.
 * @async
 */
export async function searchSetText(species, query, config, { inName = true, inDescription = true } = {}) {
    // Tokenizing the query using the same logic as in the feedstock repository,
    // but preserving our wildcards for special handling later.
    const processed = query.toLowerCase().replace(/[^a-zA-Z0-9-?*]/g, " ");

    // Splitting on whitespace leaves empty strings at the ends (and lone dashes in the middle);
    // both must be dropped, otherwise they are searched as tokens that match nothing and the
    // final intersection becomes empty.
    const words = processed.split(/\s+/).filter(x => x !== "" && x !== "-");
    const tokens = Array.from(new Set(words));

    let gathered = [];
    if (inName && inDescription) {
        const resolved = await Promise.all([
            fetchSetsByToken(species, tokens, "names", config),
            fetchSetsByToken(species, tokens, "descriptions", config)
        ]);
        const gathered_names = resolved[0];
        const gathered_descriptions = resolved[1];

        // A token matches a set if it is present in either the name or the description.
        for (let t = 0; t < tokens.length; t++) {
            const combined = [...gathered_names[t], ...gathered_descriptions[t]];
            gathered.push(Array.from(new Set(combined)));
        }
    } else if (inName) {
        gathered = await fetchSetsByToken(species, tokens, "names", config);
    } else if (inDescription) {
        gathered = await fetchSetsByToken(species, tokens, "descriptions", config);
    }

    // Only sets matched by every token are retained.
    return utils.intersect(gathered);
}