// Running total of the number of sets, derived from the per-collection entries below.
85 uint64_t total_sets = 0;
// Load the range information for the collections file. The result is used as a pair:
// both '.first' and '.second' are handed to the consistency check against 'collections.tsv'.
87 auto coll_info = internal::load_ranges_with_sizes(prefix +
"collections.tsv.ranges.gz");
88 internal::check_collection_details(prefix +
"collections.tsv", coll_info.first, coll_info.second);
// Largest representable uint64_t; used to detect overflow before it can occur.
89 constexpr uint64_t limit = std::numeric_limits<uint64_t>::max();
90 for (
auto x : coll_info.second) {
// Guard: if 'x' is larger than the remaining headroom, 'total_sets + x' would wrap around.
// NOTE(review): the statement that accumulates 'x' into 'total_sets' is not visible in
// this excerpt -- presumably it follows this guard; confirm against the full file.
91 if (limit - total_sets < x) {
92 throw std::runtime_error(
"64-bit unsigned integer overflow for the sum of the number of sets in 'collections.tsv.ranges.gz'");
// Per-set sizes; populated below from the second element of the 'sets.tsv.ranges.gz' load
// and later compared against the number of indices on each line of 'set2gene.tsv'.
98 std::vector<uint64_t> set_sizes;
100 auto set_info = internal::load_ranges_with_sizes(prefix +
"sets.tsv.ranges.gz");
// The number of entries in 'set_info.first' must equal the total computed from the collections.
101 if (
static_cast<uint64_t
>(set_info.first.size()) != total_sets) {
102 throw std::runtime_error(
"total number of sets in 'sets.tsv' does not match with the reported number from 'collections.tsv.ranges.gz'");
// Take over the loaded sizes by swapping instead of copying; 'set_info.second' is left
// holding the previous contents of 'set_sizes'.
104 set_sizes.swap(set_info.second);
// String-keyed maps to vectors of uint64_t, one for set names ('token_n') and one for set
// descriptions ('token_d'). They are later looked up by token and compared against the
// indices read from the 'tokens-*.tsv' files.
106 std::unordered_map<std::string, std::vector<uint64_t> > token_n, token_d;
107 internal::check_set_details(
// NOTE(review): the arguments preceding this callback are not visible in this excerpt.
// The callback receives a line number together with a name and a description, and passes
// each string to internal::tokenize() along with the corresponding map.
// Presumably tokenize() records 'line' under every token found -- confirm in its definition.
111 [&](uint64_t line,
const std::string& name,
const std::string& description) {
112 internal::tokenize(line, name, token_n);
113 internal::tokenize(line, description, token_d);
// Run the same validation twice: iteration 0 handles tokens from names, iteration 1 handles
// tokens from descriptions.
118 for (
int tt = 0; tt < 2; ++tt) {
// 'type' is used both to build the file name and inside the error messages.
119 std::string type = (tt == 0 ?
"names" :
"descriptions");
// Reference to the map built from 'sets.tsv' for this type (no copy is made).
120 const auto& tokens = (tt == 0 ? token_n : token_d);
// File names relative to 'prefix': "tokens-<type>.tsv" and its ".ranges.gz" companion.
122 auto path =
"tokens-" + type +
".tsv";
123 auto ranges_path = path +
".ranges.gz";
124 auto tok_info = internal::load_named_ranges(prefix + ranges_path);
125 internal::check_tokens(tok_info.first, ranges_path);
// The ranges file must list exactly as many tokens as were collected from 'sets.tsv'.
126 if (tok_info.first.size() != tokens.size()) {
127 throw std::runtime_error(
"different number of tokens from " + type +
" between '" + ranges_path +
"' and 'sets.tsv'");
130 internal::check_indices<false>(
// NOTE(review): the arguments preceding this callback are not visible in this excerpt.
// The callback receives a line number and the indices for that line.
134 [&](uint64_t line,
const std::vector<uint64_t>& indices) {
// The token for this line is taken from 'tok_info.first' at position 'line'.
// NOTE(review): assumes 'line' is always within 'tok_info.first' -- no bounds check is
// visible here; confirm that check_indices guarantees it.
135 const auto& tok = tok_info.first[line];
// Every token listed in the file must also have been seen in 'sets.tsv'.
136 auto tIt = tokens.find(tok);
137 if (tIt == tokens.end()) {
138 throw std::runtime_error(
"token '" + tok +
"' in '" + ranges_path +
"' is not present in " + type +
" in 'sets.tsv'");
// The indices stored for the token must agree with those derived from 'sets.tsv',
// as judged by internal::same_vectors().
140 if (!internal::same_vectors(tIt->second, indices)) {
141 throw std::runtime_error(
"sets for token '" + tok +
"' in '" + path +
"' are inconsistent with " + type +
" in 'sets.tsv'");
// One vector per gene ('num_genes' entries); filled below with the line numbers from
// 'set2gene.tsv' whose indices mention that gene.
149 std::vector<std::vector<uint64_t> > reverse_map(num_genes);
151 auto s2g_info = internal::load_ranges(prefix +
"set2gene.tsv.ranges.gz");
// The ranges file must have one entry per set.
// NOTE(review): 'total_sets' is a uint64_t narrowed to size_t here, which would truncate
// where size_t is narrower than 64 bits; the comparison for 'sets.tsv' casts the size up to
// uint64_t instead. Consider doing the same here.
152 if (s2g_info.size() !=
static_cast<size_t>(total_sets)) {
153 throw std::runtime_error(
"number of lines in 'set2gene.tsv.ranges.gz' does not match the total number of sets");
156 internal::check_indices<true>(
157 prefix +
"set2gene.tsv",
// NOTE(review): the arguments between the path and this callback are not visible in this excerpt.
// The callback receives a line number and the indices for that line.
160 [&](uint64_t line,
const std::vector<uint64_t>& indices) {
// The number of indices on this line must equal the size recorded in 'set_sizes' for it.
161 if (
static_cast<uint64_t
>(indices.size()) != set_sizes[line]) {
162 throw std::runtime_error(
"size of set " + std::to_string(line) +
" from 'sets.tsv.ranges.gz' does not match with that in 'set2gene.tsv'");
// Record this line under every index it lists, building the inverse mapping.
// NOTE(review): 'i' indexes 'reverse_map' without a visible bounds check; presumably
// check_indices<true> rejects indices >= num_genes -- confirm.
164 for (
auto i : indices) {
165 reverse_map[i].push_back(line);
173 auto g2s_info = internal::load_ranges(prefix +
"gene2set.tsv.ranges.gz");
// The ranges file must have one entry per gene.
174 if (g2s_info.size() !=
static_cast<size_t>(num_genes)) {
175 throw std::runtime_error(
"number of lines in 'gene2set.tsv.ranges.gz' does not match the total number of genes");
178 internal::check_indices<true>(
179 prefix +
"gene2set.tsv",
// NOTE(review): the arguments between the path and this callback are not visible in this excerpt.
// The callback receives a line number and the indices for that line, and requires them to
// agree with the reverse mapping built from 'set2gene.tsv', as judged by internal::same_vectors().
182 [&](uint64_t line,
const std::vector<uint64_t>& indices) {
183 if (!internal::same_vectors(reverse_map[line], indices)) {
184 throw std::runtime_error(
"sets for gene " + std::to_string(line) +
" in 'gene2set.tsv' are inconsistent with 'set2gene.tsv'");