gesel
Validating Gesel database files
Loading...
Searching...
No Matches
validate_database.hpp
Go to the documentation of this file.
1#ifndef GESEL_VALIDATE_DATABASE_HPP
2#define GESEL_VALIDATE_DATABASE_HPP
3
4#include "check_collection_details.hpp"
5#include "check_indices.hpp"
6#include "check_set_details.hpp"
7#include "load_ranges.hpp"
8
9#include <string>
10#include <cstdint>
11#include <stdexcept>
12#include <vector>
13#include <unordered_map>
14
20namespace gesel {
21
25namespace internal {
26
27inline void tokenize(uint64_t index, const std::string& text, std::unordered_map<std::string, std::vector<uint64_t> >& tokens_to_sets) {
28 std::string latest;
29 auto add = [&]() {
30 if (latest.size()) {
31 auto& vec = tokens_to_sets[latest];
32 if (vec.empty() || vec.back() != index) {
33 vec.push_back(index);
34 }
35 latest.clear();
36 }
37 };
38
39 for (auto x : text) {
40 x = std::tolower(x);
41 if (invalid_token_character(x)) {
42 add();
43 } else {
44 latest += x;
45 }
46 }
47
48 add();
49}
50
51inline void check_tokens(const std::vector<std::string>& tokens, const std::string& path) {
52 for (size_t t = 0, end = tokens.size(); t < end; ++t) {
53 const auto& token = tokens[t];
54 if (token.empty()) {
55 throw std::runtime_error("token should not be an empty string in '" + path + "' " + append_line_number(t));
56 }
57
58 for (auto x : token) {
59 if (invalid_token_character(x)) {
60 throw std::runtime_error("tokens should only contain lower-case alphabetical characters, digits or a dash in '" + path + "' " + append_line_number(t));
61 }
62 }
63
64 if (t && token <= tokens[t - 1]) {
65 throw std::runtime_error("tokens should be unique and lexicographically sorted in '" + path + "' " + append_line_number(t));
66 }
67 }
68}
69
70}
84inline void validate_database(const std::string& prefix, uint64_t num_genes) {
85 uint64_t total_sets = 0;
86 {
87 auto coll_info = internal::load_ranges_with_sizes(prefix + "collections.tsv.ranges.gz");
88 internal::check_collection_details(prefix + "collections.tsv", coll_info.first, coll_info.second);
89 constexpr uint64_t limit = std::numeric_limits<uint64_t>::max();
90 for (auto x : coll_info.second) {
91 if (limit - total_sets < x) {
92 throw std::runtime_error("64-bit unsigned integer overflow for the sum of the number of sets in 'collections.tsv.ranges.gz'");
93 }
94 total_sets += x;
95 }
96 }
97
98 std::vector<uint64_t> set_sizes;
99 {
100 auto set_info = internal::load_ranges_with_sizes(prefix + "sets.tsv.ranges.gz");
101 if (static_cast<uint64_t>(set_info.first.size()) != total_sets) {
102 throw std::runtime_error("total number of sets in 'sets.tsv' does not match with the reported number from 'collections.tsv.ranges.gz'");
103 }
104 set_sizes.swap(set_info.second);
105
106 std::unordered_map<std::string, std::vector<uint64_t> > token_n, token_d;
107 internal::check_set_details(
108 prefix + "sets.tsv",
109 set_info.first,
110 set_sizes,
111 [&](uint64_t line, const std::string& name, const std::string& description) {
112 internal::tokenize(line, name, token_n);
113 internal::tokenize(line, description, token_d);
114 }
115 );
116
117 // Check for correct tokenization.
118 for (int tt = 0; tt < 2; ++tt) {
119 std::string type = (tt == 0 ? "names" : "descriptions");
120 const auto& tokens = (tt == 0 ? token_n : token_d);
121
122 auto path = "tokens-" + type + ".tsv";
123 auto ranges_path = path + ".ranges.gz";
124 auto tok_info = internal::load_named_ranges(prefix + ranges_path);
125 internal::check_tokens(tok_info.first, ranges_path);
126 if (tok_info.first.size() != tokens.size()) {
127 throw std::runtime_error("different number of tokens from " + type + " between '" + ranges_path + "' and 'sets.tsv'");
128 }
129
130 internal::check_indices<false>(
131 prefix + path,
132 total_sets,
133 tok_info.second,
134 [&](uint64_t line, const std::vector<uint64_t>& indices) {
135 const auto& tok = tok_info.first[line];
136 auto tIt = tokens.find(tok);
137 if (tIt == tokens.end()) {
138 throw std::runtime_error("token '" + tok + "' in '" + ranges_path + "' is not present in " + type + " in 'sets.tsv'");
139 }
140 if (!internal::same_vectors(tIt->second, indices)) {
141 throw std::runtime_error("sets for token '" + tok + "' in '" + path + "' are inconsistent with " + type + " in 'sets.tsv'");
142 }
143 }
144 );
145 }
146 }
147
148 // Check for correct mapping of sets to genes.
149 std::vector<std::vector<uint64_t> > reverse_map(num_genes);
150 {
151 auto s2g_info = internal::load_ranges(prefix + "set2gene.tsv.ranges.gz");
152 if (s2g_info.size() != static_cast<size_t>(total_sets)) {
153 throw std::runtime_error("number of lines in 'set2gene.tsv.ranges.gz' does not match the total number of sets");
154 }
155
156 internal::check_indices<true>(
157 prefix + "set2gene.tsv",
158 num_genes,
159 s2g_info,
160 [&](uint64_t line, const std::vector<uint64_t>& indices) {
161 if (static_cast<uint64_t>(indices.size()) != set_sizes[line]) {
162 throw std::runtime_error("size of set " + std::to_string(line) + " from 'sets.tsv.ranges.gz' does not match with that in 'set2gene.tsv'");
163 }
164 for (auto i : indices) {
165 reverse_map[i].push_back(line);
166 }
167 }
168 );
169 }
170
171 // And making sure that the reverse mapping is consistent.
172 {
173 auto g2s_info = internal::load_ranges(prefix + "gene2set.tsv.ranges.gz");
174 if (g2s_info.size() != static_cast<size_t>(num_genes)) {
175 throw std::runtime_error("number of lines in 'gene2set.tsv.ranges.gz' does not match the total number of genes");
176 }
177
178 internal::check_indices<true>(
179 prefix + "gene2set.tsv",
180 total_sets,
181 g2s_info,
182 [&](uint64_t line, const std::vector<uint64_t>& indices) {
183 if (!internal::same_vectors(reverse_map[line], indices)) {
184 throw std::runtime_error("sets for gene " + std::to_string(line) + " in 'gene2set.tsv' are inconsistent with 'set2gene.tsv'");
185 }
186 }
187 );
188 }
189}
190
191}
192
193#endif
Validate Gesel database and gene files.
Definition validate_database.hpp:20
void validate_database(const std::string &prefix, uint64_t num_genes)
Definition validate_database.hpp:84