btllib
nthash.hpp
1 /*
2  * nthash.hpp
3  * Author: Hamid Mohamadi
4  * Genome Sciences Centre,
5  * British Columbia Cancer Agency
6  */
7 
8 #ifndef BTLLIB_NTHASH_HPP
9 #define BTLLIB_NTHASH_HPP
10 
11 #include <cstdint>
12 #include <limits>
13 #include <string>
14 #include <vector>
15 
16 namespace btllib {
17 
// Mask applied to a base character to locate its complement in the seed
// tables: the low three bits of 'A'/'C'/'G'/'T' select slots 0..7, which
// hold the seed of the complementary base.
constexpr uint8_t cp_off = 0x07;

// Right-shift amount used when generating multiple hash values.
constexpr int multishift = 27;

// Multiplier seed used when generating multiple hash values.
static constexpr uint64_t multiseed = 0x90b45d39fb6da1fa;

// 64-bit random seeds, one per base; seedN stands for any other character.
static constexpr uint64_t seedA = 0x3c8bfbb395c60474;
static constexpr uint64_t seedC = 0x3193c18562a02b4c;
static constexpr uint64_t seedG = 0x20323ed082572324;
static constexpr uint64_t seedT = 0x295549f54be24456;
static constexpr uint64_t seedN = 0x0000000000000000;

// Number of entries in every character-indexed lookup table.
static constexpr int ASCII_SIZE = 256;
35 
36 static const uint64_t seed_tab[ASCII_SIZE] = {
37  seedN, seedT, seedN, seedG, seedA, seedA, seedN, seedC, // 0..7
38  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15
39  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23
40  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31
41  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39
42  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47
43  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55
44  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63
45  seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71
46  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79
47  seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 80..87
48  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95
49  seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103
50  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111
51  seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 112..119
52  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127
53  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135
54  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143
55  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151
56  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159
57  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167
58  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175
59  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183
60  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191
61  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199
62  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207
63  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215
64  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223
65  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231
66  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239
67  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247
68  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN // 248..255
69 };
70 
// Precomputed table for base A, right (low) 33-bit half. Entry 0 equals the
// low 33 bits of seedA; the following entries look like successive one-bit
// left rotations within 33 bits (spot-checked on the first entries).
static constexpr uint64_t A33r[33] = {
  0x195c60474, 0x12b8c08e9, 0x571811d3,  // 0..2
  0xae3023a6,  0x15c60474c, 0xb8c08e99,  // 3..5
  0x171811d32, 0xe3023a65,  0x1c60474ca, // 6..8
  0x18c08e995, 0x11811d32b, 0x3023a657,  // 9..11
  0x60474cae,  0xc08e995c,  0x1811d32b8, // 12..14
  0x1023a6571, 0x474cae3,   0x8e995c6,   // 15..17
  0x11d32b8c,  0x23a65718,  0x474cae30,  // 18..20
  0x8e995c60,  0x11d32b8c0, 0x3a657181,  // 21..23
  0x74cae302,  0xe995c604,  0x1d32b8c08, // 24..26
  0x1a6571811, 0x14cae3023, 0x995c6047,  // 27..29
  0x132b8c08e, 0x6571811d,  0xcae3023a   // 30..32
};
79 
// Precomputed table for base A, left (high) 31-bit half. Entry 0 equals seedA
// with its low 33 bits cleared; the following entries look like successive
// one-bit left rotations of that 31-bit field (spot-checked on the first
// entries).
static constexpr uint64_t A31l[31] = {
  0x3c8bfbb200000000, 0x7917f76400000000, 0xf22feec800000000, // 0..2
  0xe45fdd9200000000, 0xc8bfbb2600000000, 0x917f764e00000000, // 3..5
  0x22feec9e00000000, 0x45fdd93c00000000, 0x8bfbb27800000000, // 6..8
  0x17f764f200000000, 0x2feec9e400000000, 0x5fdd93c800000000, // 9..11
  0xbfbb279000000000, 0x7f764f2200000000, 0xfeec9e4400000000, // 12..14
  0xfdd93c8a00000000, 0xfbb2791600000000, 0xf764f22e00000000, // 15..17
  0xeec9e45e00000000, 0xdd93c8be00000000, 0xbb27917e00000000, // 18..20
  0x764f22fe00000000, 0xec9e45fc00000000, 0xd93c8bfa00000000, // 21..23
  0xb27917f600000000, 0x64f22fee00000000, 0xc9e45fdc00000000, // 24..26
  0x93c8bfba00000000, 0x27917f7600000000, 0x4f22feec00000000, // 27..29
  0x9e45fdd800000000                                          // 30
};
93 
// Precomputed table for base C, right (low) 33-bit half. Entry 0 equals the
// low 33 bits of seedC; the following entries look like successive one-bit
// left rotations within 33 bits.
static constexpr uint64_t C33r[33] = {
  0x162a02b4c, 0xc5405699,  0x18a80ad32, // 0..2
  0x115015a65, 0x2a02b4cb,  0x54056996,  // 3..5
  0xa80ad32c,  0x15015a658, 0xa02b4cb1,  // 6..8
  0x140569962, 0x80ad32c5,  0x1015a658a, // 9..11
  0x2b4cb15,   0x569962a,   0xad32c54,   // 12..14
  0x15a658a8,  0x2b4cb150,  0x569962a0,  // 15..17
  0xad32c540,  0x15a658a80, 0xb4cb1501,  // 18..20
  0x169962a02, 0xd32c5405,  0x1a658a80a, // 21..23
  0x14cb15015, 0x9962a02b,  0x132c54056, // 24..26
  0x658a80ad,  0xcb15015a,  0x1962a02b4, // 27..29
  0x12c540569, 0x58a80ad3,  0xb15015a6   // 30..32
};
102 
// Precomputed table for base C, left (high) 31-bit half. Entry 0 equals seedC
// with its low 33 bits cleared; the following entries look like successive
// one-bit left rotations of that 31-bit field.
static constexpr uint64_t C31l[31] = {
  0x3193c18400000000, 0x6327830800000000, 0xc64f061000000000, // 0..2
  0x8c9e0c2200000000, 0x193c184600000000, 0x3278308c00000000, // 3..5
  0x64f0611800000000, 0xc9e0c23000000000, 0x93c1846200000000, // 6..8
  0x278308c600000000, 0x4f06118c00000000, 0x9e0c231800000000, // 9..11
  0x3c18463200000000, 0x78308c6400000000, 0xf06118c800000000, // 12..14
  0xe0c2319200000000, 0xc184632600000000, 0x8308c64e00000000, // 15..17
  0x6118c9e00000000,  0xc23193c00000000,  0x1846327800000000, // 18..20
  0x308c64f000000000, 0x6118c9e000000000, 0xc23193c000000000, // 21..23
  0x8463278200000000, 0x8c64f0600000000,  0x118c9e0c00000000, // 24..26
  0x23193c1800000000, 0x4632783000000000, 0x8c64f06000000000, // 27..29
  0x18c9e0c200000000                                          // 30
};
116 
// Precomputed table for base G, right (low) 33-bit half. Entry 0 equals the
// low 33 bits of seedG; the following entries look like successive one-bit
// left rotations within 33 bits.
static constexpr uint64_t G33r[33] = {
  0x82572324,  0x104ae4648, 0x95c8c91,   // 0..2
  0x12b91922,  0x25723244,  0x4ae46488,  // 3..5
  0x95c8c910,  0x12b919220, 0x57232441,  // 6..8
  0xae464882,  0x15c8c9104, 0xb9192209,  // 9..11
  0x172324412, 0xe4648825,  0x1c8c9104a, // 12..14
  0x191922095, 0x12324412b, 0x46488257,  // 15..17
  0x8c9104ae,  0x11922095c, 0x324412b9,  // 18..20
  0x64882572,  0xc9104ae4,  0x1922095c8, // 21..23
  0x124412b91, 0x48825723,  0x9104ae46,  // 24..26
  0x122095c8c, 0x4412b919,  0x88257232,  // 27..29
  0x1104ae464, 0x2095c8c9,  0x412b9192   // 30..32
};
125 
// Precomputed table for base G, left (high) 31-bit half. Entry 0 equals seedG
// with its low 33 bits cleared; the following entries look like successive
// one-bit left rotations of that 31-bit field.
static constexpr uint64_t G31l[31] = {
  0x20323ed000000000, 0x40647da000000000, 0x80c8fb4000000000, // 0..2
  0x191f68200000000,  0x323ed0400000000,  0x647da0800000000,  // 3..5
  0xc8fb41000000000,  0x191f682000000000, 0x323ed04000000000, // 6..8
  0x647da08000000000, 0xc8fb410000000000, 0x91f6820200000000, // 9..11
  0x23ed040600000000, 0x47da080c00000000, 0x8fb4101800000000, // 12..14
  0x1f68203200000000, 0x3ed0406400000000, 0x7da080c800000000, // 15..17
  0xfb41019000000000, 0xf682032200000000, 0xed04064600000000, // 18..20
  0xda080c8e00000000, 0xb410191e00000000, 0x6820323e00000000, // 21..23
  0xd040647c00000000, 0xa080c8fa00000000, 0x410191f600000000, // 24..26
  0x820323ec00000000, 0x40647da00000000,  0x80c8fb400000000,  // 27..29
  0x10191f6800000000                                          // 30
};
139 
// Precomputed table for base T, right (low) 33-bit half. Entry 0 equals the
// low 33 bits of seedT; the following entries look like successive one-bit
// left rotations within 33 bits.
static constexpr uint64_t T33r[33] = {
  0x14be24456, 0x97c488ad,  0x12f89115a, // 0..2
  0x5f1222b5,  0xbe24456a,  0x17c488ad4, // 3..5
  0xf89115a9,  0x1f1222b52, 0x1e24456a5, // 6..8
  0x1c488ad4b, 0x189115a97, 0x11222b52f, // 9..11
  0x24456a5f,  0x488ad4be,  0x9115a97c,  // 12..14
  0x1222b52f8, 0x4456a5f1,  0x88ad4be2,  // 15..17
  0x1115a97c4, 0x22b52f89,  0x456a5f12,  // 18..20
  0x8ad4be24,  0x115a97c48, 0x2b52f891,  // 21..23
  0x56a5f122,  0xad4be244,  0x15a97c488, // 24..26
  0xb52f8911,  0x16a5f1222, 0xd4be2445,  // 27..29
  0x1a97c488a, 0x152f89115, 0xa5f1222b   // 30..32
};
148 
// Precomputed table for base T, left (high) 31-bit half. Entry 0 equals seedT
// with its low 33 bits cleared; the following entries look like successive
// one-bit left rotations of that 31-bit field.
static constexpr uint64_t T31l[31] = {
  0x295549f400000000, 0x52aa93e800000000, 0xa55527d000000000, // 0..2
  0x4aaa4fa200000000, 0x95549f4400000000, 0x2aa93e8a00000000, // 3..5
  0x55527d1400000000, 0xaaa4fa2800000000, 0x5549f45200000000, // 6..8
  0xaa93e8a400000000, 0x5527d14a00000000, 0xaa4fa29400000000, // 9..11
  0x549f452a00000000, 0xa93e8a5400000000, 0x527d14aa00000000, // 12..14
  0xa4fa295400000000, 0x49f452aa00000000, 0x93e8a55400000000, // 15..17
  0x27d14aaa00000000, 0x4fa2955400000000, 0x9f452aa800000000, // 18..20
  0x3e8a555200000000, 0x7d14aaa400000000, 0xfa29554800000000, // 21..23
  0xf452aa9200000000, 0xe8a5552600000000, 0xd14aaa4e00000000, // 24..26
  0xa295549e00000000, 0x452aa93e00000000, 0x8a55527c00000000, // 27..29
  0x14aaa4fa00000000                                          // 30
};
162 
163 static const uint64_t N33r[33] = {
164  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN,
165  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN,
166  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN
167 };
168 
169 static const uint64_t N31l[31] = {
170  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN,
171  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN,
172  seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN
173 };
174 
175 static const uint64_t* ms_tab_33r[ASCII_SIZE] = {
176  N33r, T33r, N33r, G33r, A33r, A33r, N33r, C33r, // 0..7
177  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 8..15
178  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 16..23
179  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 24..31
180  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 32..39
181  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 40..47
182  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 48..55
183  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 56..63
184  N33r, A33r, N33r, C33r, N33r, N33r, N33r, G33r, // 64..71
185  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 72..79
186  N33r, N33r, N33r, N33r, T33r, T33r, N33r, N33r, // 80..87
187  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 88..95
188  N33r, A33r, N33r, C33r, N33r, N33r, N33r, G33r, // 96..103
189  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 104..111
190  N33r, N33r, N33r, N33r, T33r, T33r, N33r, N33r, // 112..119
191  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 120..127
192  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 128..135
193  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 136..143
194  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 144..151
195  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 152..159
196  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 160..167
197  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 168..175
198  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 176..183
199  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 184..191
200  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 192..199
201  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 200..207
202  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 208..215
203  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 216..223
204  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 224..231
205  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 232..239
206  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r, // 240..247
207  N33r, N33r, N33r, N33r, N33r, N33r, N33r, N33r // 248..255
208 };
209 
210 static const uint64_t* ms_tab_31l[ASCII_SIZE] = {
211  N31l, T31l, N31l, G31l, A31l, A31l, N31l, C31l, // 0..7
212  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 8..15
213  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 16..23
214  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 24..31
215  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 32..39
216  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 40..47
217  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 48..55
218  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 56..63
219  N31l, A31l, N31l, C31l, N31l, N31l, N31l, G31l, // 64..71
220  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 72..79
221  N31l, N31l, N31l, N31l, T31l, T31l, N31l, N31l, // 80..87
222  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 88..95
223  N31l, A31l, N31l, C31l, N31l, N31l, N31l, G31l, // 96..103
224  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 104..111
225  N31l, N31l, N31l, N31l, T31l, T31l, N31l, N31l, // 112..119
226  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 120..127
227  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 128..135
228  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 136..143
229  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 144..151
230  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 152..159
231  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 160..167
232  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 168..175
233  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 176..183
234  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 184..191
235  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 192..199
236  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 200..207
237  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 208..215
238  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 216..223
239  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 224..231
240  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 232..239
241  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l, // 240..247
242  N31l, N31l, N31l, N31l, N31l, N31l, N31l, N31l // 248..255
243 };
244 
// Rotates the 64-bit word "v" left by one bit; the top bit wraps to bit 0.
inline uint64_t
rol1(const uint64_t v)
{
  const uint64_t wrapped = v >> 63; // NOLINT
  return (v << 1) | wrapped;
}
251 
// Rotates the 64-bit word "v" right by one bit; bit 0 wraps to the top bit.
inline uint64_t
ror1(const uint64_t v)
{
  const uint64_t wrapped = v << 63; // NOLINT
  return (v >> 1) | wrapped;
}
258 
// Rotates a 31-bit field left by "s" positions (taken modulo 31).
// The field is read from the low 31 bits of "v" and the result is returned in
// the low 31 bits. Any higher bits of "v" are discarded first; without that
// mask they would be shifted down into the result.
inline uint64_t
rol31(const uint64_t v, unsigned s)
{
  const uint64_t mask = 0x7FFFFFFF; // NOLINT
  const uint64_t field = v & mask;
  s %= 31; // NOLINT
  // For s == 0 the right shift is by 31, which is 0 for a 31-bit field.
  return ((field << s) | (field >> (31 - s))) & mask; // NOLINT
}
266 
// Rotates a 33-bit field left by "s" positions (taken modulo 33).
// The field is read from the low 33 bits of "v" and the result is returned in
// the low 33 bits. Any higher bits of "v" are discarded first; without that
// mask they would be shifted down into the result.
inline uint64_t
rol33(const uint64_t v, unsigned s)
{
  const uint64_t mask = 0x1FFFFFFFF; // NOLINT
  const uint64_t field = v & mask;
  s %= 33; // NOLINT
  // For s == 0 the right shift is by 33, which is 0 for a 33-bit field.
  return ((field << s) | (field >> (33 - s))) & mask; // NOLINT
}
274 
// Exchanges bit 0 and bit 33 of "v", leaving every other bit untouched.
inline uint64_t
swapbits033(const uint64_t v)
{
  const uint64_t bit0 = v & 1;          // NOLINT
  const uint64_t bit33 = (v >> 33) & 1; // NOLINT
  // 1 only when the two bits differ; flipping both then swaps them.
  const uint64_t differ = bit0 ^ bit33;
  return v ^ (differ | (differ << 33)); // NOLINT
}
282 
// Exchanges bit 32 and bit 63 of "v", leaving every other bit untouched.
inline uint64_t
swapbits3263(const uint64_t v)
{
  const uint64_t bit32 = (v >> 32) & 1; // NOLINT
  const uint64_t bit63 = v >> 63;       // NOLINT
  // 1 only when the two bits differ; flipping both then swaps them.
  const uint64_t differ = bit32 ^ bit63;
  return v ^ ((differ << 32) | (differ << 63)); // NOLINT
}
290 
291 // forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
292 inline uint64_t
293 NTF64(const char* kmer_seq, const unsigned k)
294 {
295  uint64_t h_val = 0;
296  for (unsigned i = 0; i < k; i++) {
297  h_val = rol1(h_val);
298  h_val = swapbits033(h_val);
299  h_val ^= seed_tab[(unsigned char)kmer_seq[i]];
300  }
301  return h_val;
302 }
303 
304 // reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
305 inline uint64_t
306 NTR64(const char* kmer_seq, const unsigned k)
307 {
308  uint64_t h_val = 0;
309  for (unsigned i = 0; i < k; i++) {
310  h_val = rol1(h_val);
311  h_val = swapbits033(h_val);
312  h_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i] & cp_off];
313  }
314  return h_val;
315 }
316 
317 // forward-strand ntHash for sliding k-mers
318 inline uint64_t
319 NTF64(const uint64_t fh_val,
320  const unsigned k,
321  const unsigned char char_out,
322  const unsigned char char_in)
323 {
324  uint64_t h_val = rol1(fh_val);
325  h_val = swapbits033(h_val);
326  h_val ^= seed_tab[char_in];
327  h_val ^=
328  (ms_tab_31l[char_out][k % 31] | ms_tab_33r[char_out][k % 33]); // NOLINT
329  return h_val;
330 }
331 
332 // reverse-complement ntHash for sliding k-mers
333 inline uint64_t
334 NTR64(const uint64_t rh_val,
335  const unsigned k,
336  const unsigned char char_out,
337  const unsigned char char_in)
338 {
339  uint64_t h_val = rh_val ^ (ms_tab_31l[char_in & cp_off][k % 31] | // NOLINT
340  ms_tab_33r[char_in & cp_off][k % 33]); // NOLINT
341  h_val ^= seed_tab[char_out & cp_off];
342  h_val = ror1(h_val);
343  h_val = swapbits3263(h_val);
344  return h_val;
345 }
346 
347 // canonical ntBase
348 inline uint64_t
349 NTC64(const char* kmer_seq, const unsigned k)
350 {
351  uint64_t fh_val = 0, rh_val = 0;
352  fh_val = NTF64(kmer_seq, k);
353  rh_val = NTR64(kmer_seq, k);
354  return (rh_val < fh_val) ? rh_val : fh_val;
355 }
356 
357 // canonical ntHash
358 inline uint64_t
359 NTC64(const char* kmer_seq,
360  const unsigned k,
361  uint64_t& fh_val,
362  uint64_t& rh_val)
363 {
364  fh_val = NTF64(kmer_seq, k);
365  rh_val = NTR64(kmer_seq, k);
366  return (rh_val < fh_val) ? rh_val : fh_val;
367 }
368 
369 // canonical ntHash for sliding k-mers
370 inline uint64_t
371 NTC64(const unsigned char char_out,
372  const unsigned char char_in,
373  const unsigned k,
374  uint64_t& fh_val,
375  uint64_t& rh_val)
376 {
377  fh_val = NTF64(fh_val, k, char_out, char_in);
378  rh_val = NTR64(rh_val, k, char_out, char_in);
379  return (rh_val < fh_val) ? rh_val : fh_val;
380 }
381 
382 // forward-strand ntHash for sliding k-mers to the left
383 inline uint64_t
384 NTF64L(const uint64_t rh_val,
385  const unsigned k,
386  const unsigned char char_out,
387  const unsigned char char_in)
388 {
389  uint64_t h_val = rh_val ^ (ms_tab_31l[char_in][k % 31] | // NOLINT
390  ms_tab_33r[char_in][k % 33]); // NOLINT
391  h_val ^= seed_tab[char_out];
392  h_val = ror1(h_val);
393  h_val = swapbits3263(h_val);
394  return h_val;
395 }
396 
397 // reverse-complement ntHash for sliding k-mers to the left
398 inline uint64_t
399 NTR64L(const uint64_t fh_val,
400  const unsigned k,
401  const unsigned char char_out,
402  const unsigned char char_in)
403 {
404  uint64_t h_val = rol1(fh_val);
405  h_val = swapbits033(h_val);
406  h_val ^= seed_tab[char_in & cp_off];
407  h_val ^= (ms_tab_31l[char_out & cp_off][k % 31] | // NOLINT
408  ms_tab_33r[char_out & cp_off][k % 33]); // NOLINT
409  return h_val;
410 }
411 
412 // canonical ntHash for sliding k-mers to the left
413 inline uint64_t
414 NTC64L(const unsigned char char_out,
415  const unsigned char char_in,
416  const unsigned k,
417  uint64_t& fh_val,
418  uint64_t& rh_val)
419 {
420  fh_val = NTF64L(fh_val, k, char_out, char_in);
421  rh_val = NTR64L(rh_val, k, char_out, char_in);
422  return (rh_val < fh_val) ? rh_val : fh_val;
423 }
424 
425 // ntBase with seeding option
426 inline uint64_t
427 NTF64(const char* kmer_seq, const unsigned k, const unsigned seed)
428 {
429  uint64_t h_val = NTF64(kmer_seq, k);
430  if (seed == 0) {
431  return h_val;
432  }
433  h_val *= seed ^ k * multiseed;
434  h_val ^= h_val >> multishift;
435  return h_val;
436 }
437 
438 // canonical ntBase with seeding option
439 inline uint64_t
440 NTC64(const char* kmer_seq, const unsigned k, const unsigned seed)
441 {
442  uint64_t h_val = NTC64(kmer_seq, k);
443  if (seed == 0) {
444  return h_val;
445  }
446  h_val *= seed ^ k * multiseed;
447  h_val ^= h_val >> multishift;
448  return h_val;
449 }
450 
451 // multihash ntHash, ntBase
452 inline void
453 NTM64(const char* kmer_seq, const unsigned k, const unsigned m, uint64_t* h_val)
454 {
455  uint64_t b_val = 0, t_val = 0;
456  b_val = NTF64(kmer_seq, k);
457  h_val[0] = b_val;
458  for (unsigned i = 1; i < m; i++) {
459  t_val = b_val * (i ^ k * multiseed);
460  t_val ^= t_val >> multishift;
461  h_val[i] = t_val;
462  }
463 }
464 
465 // one extra hash for given base hash
466 inline uint64_t
467 NTE64(const uint64_t h_val, const unsigned k, const unsigned i)
468 {
469  uint64_t t_val = h_val;
470  t_val *= (i ^ k * multiseed);
471  t_val ^= t_val >> multishift;
472  return t_val;
473 }
474 
475 // multihash ntHash for sliding k-mers
476 inline void
477 NTM64(const unsigned char char_out,
478  const unsigned char char_in,
479  const unsigned k,
480  const unsigned m,
481  uint64_t* h_val)
482 {
483  uint64_t b_val = 0, t_val = 0;
484  b_val = NTF64(h_val[0], k, char_out, char_in);
485  h_val[0] = b_val;
486  for (unsigned i = 1; i < m; i++) {
487  t_val = b_val * (i ^ k * multiseed);
488  t_val ^= t_val >> multishift;
489  h_val[i] = t_val;
490  }
491 }
492 
493 // canonical multihash ntBase
494 inline void
495 NTMC64(const char* kmer_seq,
496  const unsigned k,
497  const unsigned m,
498  uint64_t* h_val)
499 {
500  uint64_t b_val = 0, t_val = 0;
501  b_val = NTC64(kmer_seq, k);
502  h_val[0] = b_val;
503  for (unsigned i = 1; i < m; i++) {
504  t_val = b_val * (i ^ k * multiseed);
505  t_val ^= t_val >> multishift;
506  h_val[i] = t_val;
507  }
508 }
509 
510 // canonical multihash ntHash
511 inline void
512 NTMC64(const char* kmer_seq,
513  const unsigned k,
514  const unsigned m,
515  uint64_t& fh_val,
516  uint64_t& rh_val,
517  uint64_t* h_val)
518 {
519  uint64_t b_val = 0, t_val = 0;
520  b_val = NTC64(kmer_seq, k, fh_val, rh_val);
521  h_val[0] = b_val;
522  for (unsigned i = 1; i < m; i++) {
523  t_val = b_val * (i ^ k * multiseed);
524  t_val ^= t_val >> multishift;
525  h_val[i] = t_val;
526  }
527 }
528 
529 // canonical multihash ntHash for sliding k-mers
530 inline void
531 NTMC64(const unsigned char char_out,
532  const unsigned char char_in,
533  const unsigned k,
534  const unsigned m,
535  uint64_t& fh_val,
536  uint64_t& rh_val,
537  uint64_t* h_val)
538 {
539  uint64_t b_val = 0, t_val = 0;
540  b_val = NTC64(char_out, char_in, k, fh_val, rh_val);
541  h_val[0] = b_val;
542  for (unsigned i = 1; i < m; i++) {
543  t_val = b_val * (i ^ k * multiseed);
544  t_val ^= t_val >> multishift;
545  h_val[i] = t_val;
546  }
547 }
548 
/*
 * ntHash functions that ignore k-mers containing non-ACGT characters
 */
552 
553 // canonical ntBase
554 inline bool
555 NTC64(const char* kmer_seq, const unsigned k, uint64_t& h_val, unsigned& locN)
556 {
557  h_val = 0;
558  locN = 0;
559  uint64_t fh_val = 0, rh_val = 0;
560  for (int i = int(k - 1); i >= 0; i--) {
561  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
562  locN = i;
563  return false;
564  }
565  fh_val = rol1(fh_val);
566  fh_val = swapbits033(fh_val);
567  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
568 
569  rh_val = rol1(rh_val);
570  rh_val = swapbits033(rh_val);
571  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
572  }
573  h_val = (rh_val < fh_val) ? rh_val : fh_val;
574  return true;
575 }
576 
577 // canonical multihash ntBase
578 inline bool
579 NTMC64(const char* kmer_seq,
580  const unsigned k,
581  const unsigned m,
582  unsigned& locN,
583  uint64_t* h_val)
584 {
585  uint64_t b_val = 0, t_val = 0, fh_val = 0, rh_val = 0;
586  locN = 0;
587  for (int i = int(k - 1); i >= 0; i--) {
588  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
589  locN = i;
590  return false;
591  }
592  fh_val = rol1(fh_val);
593  fh_val = swapbits033(fh_val);
594  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
595 
596  rh_val = rol1(rh_val);
597  rh_val = swapbits033(rh_val);
598  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
599  }
600  b_val = (rh_val < fh_val) ? rh_val : fh_val;
601  h_val[0] = b_val;
602  for (unsigned i = 1; i < m; i++) {
603  t_val = b_val * (i ^ k * multiseed);
604  t_val ^= t_val >> multishift;
605  h_val[i] = t_val;
606  }
607  return true;
608 }
609 
610 // canonical ntHash
611 inline bool
612 NTC64(const char* kmer_seq,
613  const unsigned k,
614  uint64_t& fh_val,
615  uint64_t& rh_val,
616  uint64_t& h_val,
617  unsigned& locN)
618 {
619  h_val = fh_val = rh_val = 0;
620  locN = 0;
621  for (int i = int(k - 1); i >= 0; i--) {
622  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
623  locN = i;
624  return false;
625  }
626  fh_val = rol1(fh_val);
627  fh_val = swapbits033(fh_val);
628  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
629 
630  rh_val = rol1(rh_val);
631  rh_val = swapbits033(rh_val);
632  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
633  }
634  h_val = (rh_val < fh_val) ? rh_val : fh_val;
635  return true;
636 }
637 
638 // canonical multihash ntHash
639 inline bool
640 NTMC64(const char* kmer_seq,
641  const unsigned k,
642  const unsigned m,
643  uint64_t& fh_val,
644  uint64_t& rh_val,
645  unsigned& locN,
646  uint64_t* h_val)
647 {
648  fh_val = rh_val = 0;
649  uint64_t b_val = 0, t_val = 0;
650  locN = 0;
651  for (int i = int(k - 1); i >= 0; i--) {
652  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
653  locN = i;
654  return false;
655  }
656  fh_val = rol1(fh_val);
657  fh_val = swapbits033(fh_val);
658  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
659 
660  rh_val = rol1(rh_val);
661  rh_val = swapbits033(rh_val);
662  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
663  }
664  b_val = (rh_val < fh_val) ? rh_val : fh_val;
665  h_val[0] = b_val;
666  for (unsigned i = 1; i < m; i++) {
667  t_val = b_val * (i ^ k * multiseed);
668  t_val ^= t_val >> multishift;
669  h_val[i] = t_val;
670  }
671  return true;
672 }
673 
674 // strand-aware canonical multihash ntHash
675 inline bool
676 NTMC64(const char* kmer_seq,
677  const unsigned k,
678  const unsigned m,
679  uint64_t& fh_val,
680  uint64_t& rh_val,
681  unsigned& locN,
682  uint64_t* h_val,
683  bool& hStn)
684 {
685  fh_val = rh_val = 0;
686  uint64_t b_val = 0, t_val = 0;
687  locN = 0;
688  for (int i = int(k - 1); i >= 0; i--) {
689  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
690  locN = i;
691  return false;
692  }
693  fh_val = rol1(fh_val);
694  fh_val = swapbits033(fh_val);
695  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
696 
697  rh_val = rol1(rh_val);
698  rh_val = swapbits033(rh_val);
699  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
700  }
701  hStn = rh_val < fh_val;
702  b_val = hStn ? rh_val : fh_val;
703  h_val[0] = b_val;
704  for (unsigned i = 1; i < m; i++) {
705  t_val = b_val * (i ^ k * multiseed);
706  t_val ^= t_val >> multishift;
707  h_val[i] = t_val;
708  }
709  return true;
710 }
711 
712 // starnd-aware canonical multihash ntHash for sliding k-mers
713 inline void
714 NTMC64(const unsigned char char_out,
715  const unsigned char char_in,
716  const unsigned k,
717  const unsigned m,
718  uint64_t& fh_val,
719  uint64_t& rh_val,
720  uint64_t* h_val,
721  bool& hStn)
722 {
723  uint64_t b_val = 0, t_val = 0;
724  b_val = NTC64(char_out, char_in, k, fh_val, rh_val);
725  hStn = rh_val < fh_val;
726  h_val[0] = b_val;
727  for (unsigned i = 1; i < m; i++) {
728  t_val = b_val * (i ^ k * multiseed);
729  t_val ^= t_val >> multishift;
730  h_val[i] = t_val;
731  }
732 }
733 
734 // masking canonical ntHash using spaced seed pattern
735 inline uint64_t
736 maskHash(uint64_t& fk_val,
737  uint64_t& rk_val,
738  const char* seed_seq,
739  const char* kmer_seq,
740  const unsigned k)
741 {
742  uint64_t fs_val = fk_val, rs_val = rk_val;
743  for (unsigned i = 0; i < k; i++) {
744  if (seed_seq[i] != '1') {
745  fs_val ^=
746  (ms_tab_31l[(unsigned char)kmer_seq[i]][(k - 1 - i) % 31] | // NOLINT
747  ms_tab_33r[(unsigned char)kmer_seq[i]][(k - 1 - i) % 33]); // NOLINT
748  rs_val ^=
749  (ms_tab_31l[(unsigned char)kmer_seq[i] & cp_off][i % 31] | // NOLINT
750  ms_tab_33r[(unsigned char)kmer_seq[i] & cp_off][i % 33]); // NOLINT
751  }
752  }
753  return (rs_val < fs_val) ? rs_val : fs_val;
754 }
755 
756 // spaced seed ntHash for base kmer, i.e. fhval(kmer_0)
757 inline uint64_t
758 NTS64(const char* kmer_seq,
759  const std::vector<bool>& seed,
760  const unsigned k,
761  uint64_t& h_val)
762 {
763  h_val = 0;
764  uint64_t sVal = 0;
765  for (unsigned i = 0; i < k; i++) {
766  h_val = rol1(h_val);
767  h_val = swapbits033(h_val);
768  sVal = h_val;
769  h_val ^= seed_tab[(unsigned char)kmer_seq[i]];
770  if (seed[i]) {
771  sVal = h_val;
772  }
773  }
774  return sVal;
775 }
776 
777 // spaced seed ntHash for sliding k-mers
778 inline uint64_t
779 NTS64(const char* kmer_seq,
780  const std::vector<bool>& seed,
781  const unsigned char char_out,
782  const unsigned char char_in,
783  const unsigned k,
784  uint64_t& h_val)
785 {
786  h_val = NTF64(h_val, k, char_out, char_in);
787  uint64_t sVal = h_val;
788  for (unsigned i = 0; i < k; i++) {
789  if (!seed[i]) {
790  sVal ^= (ms_tab_31l[(unsigned char)kmer_seq[i]][k % 31] | // NOLINT
791  ms_tab_33r[(unsigned char)kmer_seq[i]][k % 33]); // NOLINT
792  }
793  }
794  return sVal;
795 }
796 
797 // strand-aware multihash spaced seed ntHash
798 inline bool
799 NTMS64(const char* kmer_seq,
800  const std::vector<std::vector<unsigned>>& seed_seq,
801  const unsigned k,
802  const unsigned m,
803  uint64_t& fh_val,
804  uint64_t& rh_val,
805  unsigned& locN,
806  uint64_t* h_val,
807  bool* hStn)
808 {
809  fh_val = rh_val = 0;
810  locN = 0;
811  for (int i = int(k - 1); i >= 0; i--) {
812  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
813  locN = i;
814  return false;
815  }
816  fh_val = rol1(fh_val);
817  fh_val = swapbits033(fh_val);
818  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
819 
820  rh_val = rol1(rh_val);
821  rh_val = swapbits033(rh_val);
822  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
823  }
824 
825  for (unsigned j = 0; j < m; j++) {
826  uint64_t fs_val = fh_val, rs_val = rh_val;
827  for (const auto& seed_pos : seed_seq[j]) {
828  fs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos]]
829  [(k - 1 - seed_pos) % 31] | // NOLINT
830  ms_tab_33r[(unsigned char)kmer_seq[seed_pos]]
831  [(k - 1 - seed_pos) % 33]); // NOLINT
832  rs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos] & cp_off]
833  [seed_pos % 31] | // NOLINT
834  ms_tab_33r[(unsigned char)kmer_seq[seed_pos] & cp_off]
835  [seed_pos % 33]); // NOLINT
836  }
837  hStn[j] = rs_val < fs_val;
838  h_val[j] = hStn[j] ? rs_val : fs_val;
839  }
840  return true;
841 }
842 
843 // strand-aware multihash spaced seed ntHash for sliding k-mers
844 inline void
845 NTMS64(const char* kmer_seq,
846  const std::vector<std::vector<unsigned>>& seed_seq,
847  const unsigned char char_out,
848  const unsigned char char_in,
849  const unsigned k,
850  const unsigned m,
851  uint64_t& fh_val,
852  uint64_t& rh_val,
853  uint64_t* h_val,
854  bool* hStn)
855 {
856  fh_val = NTF64(fh_val, k, char_out, char_in);
857  rh_val = NTR64(rh_val, k, char_out, char_in);
858  for (unsigned j = 0; j < m; j++) {
859  uint64_t fs_val = fh_val, rs_val = rh_val;
860  for (const auto& seed_pos : seed_seq[j]) {
861  fs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos]]
862  [(k - 1 - seed_pos) % 31] | // NOLINT
863  ms_tab_33r[(unsigned char)kmer_seq[seed_pos]]
864  [(k - 1 - seed_pos) % 33]); // NOLINT
865  rs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos] & cp_off]
866  [seed_pos % 31] | // NOLINT
867  ms_tab_33r[(unsigned char)kmer_seq[seed_pos] & cp_off]
868  [seed_pos % 33]); // NOLINT
869  ;
870  }
871  hStn[j] = rs_val < fs_val;
872  h_val[j] = hStn[j] ? rs_val : fs_val;
873  }
874 }
875 
876 // Multi spaced seed ntHash with multiple hashes per seed
877 inline bool
878 NTMSM64(const char* kmer_seq,
879  const std::vector<std::vector<unsigned>>& seed_seq,
880  const unsigned k,
881  const unsigned m,
882  const unsigned m2,
883  uint64_t& fh_val,
884  uint64_t& rh_val,
885  unsigned& locN,
886  uint64_t* h_val)
887 {
888  fh_val = rh_val = 0;
889  locN = 0;
890  for (int i = int(k - 1); i >= 0; i--) {
891  if (seed_tab[(unsigned char)kmer_seq[i]] == seedN) {
892  locN = i;
893  return false;
894  }
895  fh_val = rol1(fh_val);
896  fh_val = swapbits033(fh_val);
897  fh_val ^= seed_tab[(unsigned char)kmer_seq[k - 1 - i]];
898 
899  rh_val = rol1(rh_val);
900  rh_val = swapbits033(rh_val);
901  rh_val ^= seed_tab[(unsigned char)kmer_seq[i] & cp_off];
902  }
903 
904  for (unsigned j = 0; j < m; j++) {
905  uint64_t fs_val = fh_val, rs_val = rh_val;
906  for (const auto& seed_pos : seed_seq[j]) {
907  fs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos]]
908  [(k - 1 - seed_pos) % 31] | // NOLINT
909  ms_tab_33r[(unsigned char)kmer_seq[seed_pos]]
910  [(k - 1 - seed_pos) % 33]); // NOLINT
911  rs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos] & cp_off]
912  [seed_pos % 31] | // NOLINT
913  ms_tab_33r[(unsigned char)kmer_seq[seed_pos] & cp_off]
914  [seed_pos % 33]); // NOLINT
915  }
916  h_val[j * m2] = rs_val < fs_val ? rs_val : fs_val;
917  for (unsigned j2 = 1; j2 < m2; j2++) {
918  uint64_t t_val = h_val[j * m2] * (j2 ^ k * multiseed);
919  t_val ^= t_val >> multishift;
920  h_val[j * m2 + j2] = t_val;
921  }
922  }
923  return true;
924 }
925 
926 // Multi spaced seed ntHash for sliding k-mers with multiple hashes per seed
927 inline void
928 NTMSM64(const char* kmer_seq,
929  const std::vector<std::vector<unsigned>>& seed_seq,
930  const unsigned char char_out,
931  const unsigned char char_in,
932  const unsigned k,
933  const unsigned m,
934  const unsigned m2,
935  uint64_t& fh_val,
936  uint64_t& rh_val,
937  uint64_t* h_val)
938 {
939  fh_val = NTF64(fh_val, k, char_out, char_in);
940  rh_val = NTR64(rh_val, k, char_out, char_in);
941  for (unsigned j = 0; j < m; j++) {
942  uint64_t fs_val = fh_val, rs_val = rh_val;
943  for (const auto& seed_pos : seed_seq[j]) {
944  fs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos]]
945  [(k - 1 - seed_pos) % 31] | // NOLINT
946  ms_tab_33r[(unsigned char)kmer_seq[seed_pos]]
947  [(k - 1 - seed_pos) % 33]); // NOLINT
948  rs_val ^= (ms_tab_31l[(unsigned char)kmer_seq[seed_pos] & cp_off]
949  [seed_pos % 31] | // NOLINT
950  ms_tab_33r[(unsigned char)kmer_seq[seed_pos] & cp_off]
951  [seed_pos % 33]); // NOLINT
952  }
953  h_val[j * m2] = rs_val < fs_val ? rs_val : fs_val;
954  for (unsigned j2 = 1; j2 < m2; j2++) {
955  uint64_t t_val = h_val[j * m2] * (j2 ^ k * multiseed);
956  t_val ^= t_val >> multishift;
957  h_val[j * m2 + j2] = t_val;
958  }
959  }
960 }
961 
// Forward declarations of the rolling-hash classes defined further down.
class SeedNtHash;
class NtHash;

// A spaced seed, stored as a list of k-mer positions.
using SpacedSeed = std::vector<unsigned>;

// Converts textual seed patterns into position lists; defined after the
// constructors that use it.
static std::vector<SpacedSeed> parse_seeds(
  const std::vector<std::string>& seed_strings);
967 
/**
 * Rolling hasher that walks over the k-mers of a sequence and produces
 * hash_num hash values for each of them.
 *
 * The sequence is NOT copied: only the pointer passed to the constructor (or
 * the buffer of the std::string) is stored, so the caller must keep the
 * sequence alive and unmodified for as long as the object is used.
 */
class NtHash
{
public:
  /**
   * @param seq      pointer to the first character of the sequence
   * @param seq_len  number of characters in the sequence
   * @param k        k-mer length
   * @param hash_num number of hash values produced per k-mer
   */
  NtHash(const char* seq, size_t seq_len, unsigned k, unsigned hash_num);

  /**
   * Same as above, taking the sequence from a std::string. The string's
   * buffer is referenced, not copied.
   */
  NtHash(const std::string& seq, unsigned k, unsigned hash_num);

  /**
   * Hashes the next k-mer. The first call locates and hashes the first
   * k-mer that contains no rejected character.
   *
   * @return true if a k-mer was hashed, false once no further k-mer exists
   */
  bool roll();

  /** Pointer to the hash_num values written by the last successful roll(). */
  const uint64_t* hashes() const;

  /**
   * Internal cursor: 0 before the first roll(); after a successful roll() it
   * is one past the start index of the k-mer just hashed; the maximum size_t
   * value once initialisation has failed.
   */
  size_t get_pos() const
  {
    return pos;
  }

  /** k-mer length. */
  unsigned get_k() const
  {
    return k;
  }

  /** Number of hash values per k-mer. */
  unsigned get_hash_num() const
  {
    return hash_num;
  }

protected:
  /** Finds and hashes the first valid k-mer at or after the cursor. */
  bool init();

  const char* seq;                     // borrowed sequence buffer
  const size_t seq_len;                // length of the sequence
  const unsigned k;                    // k-mer length
  const unsigned hash_num;             // hash values per k-mer
  size_t pos = 0;                      // cursor, see get_pos()
  std::vector<uint64_t> hashes_vector; // storage behind hashes()
  uint64_t forward_hash = 0;           // rolling forward hash
  uint64_t reverse_hash = 0;           // rolling reverse hash
};
1022 
1023 class SeedNtHash : public NtHash
1024 {
1025 
1026 public:
1027  SeedNtHash(const char* seq,
1028  size_t seq_len,
1029  unsigned k,
1030  const std::vector<SpacedSeed>& seeds,
1031  unsigned hash_num_per_seed);
1032  SeedNtHash(const std::string& seq,
1033  unsigned k,
1034  const std::vector<SpacedSeed>& seeds,
1035  unsigned hash_num_per_seed);
1036  SeedNtHash(const char* seq,
1037  size_t seq_len,
1038  unsigned k,
1039  const std::vector<std::string>& seeds,
1040  unsigned hash_num_per_seed);
1041  SeedNtHash(const std::string& seq,
1042  unsigned k,
1043  const std::vector<std::string>& seeds,
1044  unsigned hash_num_per_seed);
1045 
1046  unsigned get_hash_num_per_seed() const { return hash_num_per_seed; }
1047 
1048  bool roll();
1049 
1050 private:
1051  bool init();
1052 
1053  const unsigned hash_num_per_seed;
1054  std::vector<SpacedSeed> seeds;
1055 };
1056 
1057 inline NtHash::NtHash(const char* seq,
1058  size_t seq_len,
1059  unsigned k,
1060  unsigned hash_num)
1061  : seq(seq)
1062  , seq_len(seq_len)
1063  , k(k)
1064  , hash_num(hash_num)
1065 {
1066  hashes_vector.resize(hash_num);
1067 }
1068 
1069 inline NtHash::NtHash(const std::string& seq, unsigned k, unsigned hash_num)
1070  : NtHash(seq.c_str(), seq.size(), k, hash_num)
1071 {}
1072 
1073 inline SeedNtHash::SeedNtHash(const char* seq,
1074  size_t seq_len,
1075  unsigned k,
1076  const std::vector<SpacedSeed>& seeds,
1077  unsigned hash_num_per_seed)
1078  : NtHash(seq, seq_len, k, seeds.size() * hash_num_per_seed)
1079  , hash_num_per_seed(hash_num_per_seed)
1080  , seeds(seeds)
1081 {}
1082 
1083 inline SeedNtHash::SeedNtHash(const std::string& seq,
1084  unsigned k,
1085  const std::vector<SpacedSeed>& seeds,
1086  unsigned hash_num_per_seed)
1087  : NtHash(seq, k, seeds.size() * hash_num_per_seed)
1088  , hash_num_per_seed(hash_num_per_seed)
1089  , seeds(seeds)
1090 {}
1091 
1092 inline SeedNtHash::SeedNtHash(const char* seq,
1093  size_t seq_len,
1094  unsigned k,
1095  const std::vector<std::string>& seeds,
1096  unsigned hash_num_per_seed)
1097  : NtHash(seq, seq_len, k, seeds.size() * hash_num_per_seed)
1098  , hash_num_per_seed(hash_num_per_seed)
1099  , seeds(parse_seeds(seeds))
1100 {}
1101 
1102 inline SeedNtHash::SeedNtHash(const std::string& seq,
1103  unsigned k,
1104  const std::vector<std::string>& seeds,
1105  unsigned hash_num_per_seed)
1106  : NtHash(seq, k, seeds.size() * hash_num_per_seed)
1107  , hash_num_per_seed(hash_num_per_seed)
1108  , seeds(parse_seeds(seeds))
1109 {}
1110 
1111 static std::vector<SpacedSeed>
1112 parse_seeds(const std::vector<std::string>& seed_strings)
1113 {
1114  std::vector<SpacedSeed> seed_set;
1115  for (const auto& seed_string : seed_strings) {
1116  SpacedSeed seed;
1117  size_t pos = 0;
1118  for (const auto& c : seed_string) {
1119  if (c != '1') {
1120  seed.push_back(pos);
1121  }
1122  ++pos;
1123  }
1124  seed_set.push_back(seed);
1125  }
1126  return seed_set;
1127 }
1128 
// Generates CLASS::init(): starting at the current cursor `pos`, finds the
// first k-mer for which NTHASH_CALL succeeds and leaves the cursor one past
// that k-mer's start index.
//
// NTHASH_CALL must be an expression convertible to bool that hashes the k-mer
// at seq + pos and, on failure, stores in the local `posN` the offset of the
// rejected character so the search can resume right after it.
//
// On failure (k longer than the sequence, or no valid k-mer left) the cursor
// is set to the maximum size_t value and false is returned.
// NOLINTNEXTLINE
#define NT_HASH_INIT(CLASS, NTHASH_CALL)                                       \
  inline bool CLASS::init()                                                    \
  {                                                                            \
    if (seq_len < k) {                                                         \
      pos = std::numeric_limits<std::size_t>::max();                           \
      return false;                                                            \
    }                                                                          \
    unsigned posN = 0;                                                         \
    while (pos < seq_len - k + 1) {                                            \
      if (NTHASH_CALL) {                                                       \
        break;                                                                 \
      }                                                                        \
      pos += posN + 1;                                                         \
    }                                                                          \
    if (pos <= seq_len - k) {                                                  \
      ++pos;                                                                   \
      return true;                                                             \
    }                                                                          \
    pos = std::numeric_limits<std::size_t>::max();                             \
    return false;                                                              \
  }
1148 
// Generates CLASS::roll(): hashes the next k-mer of the sequence.
//
//  - On the first call (cursor still 0) the work is delegated to init().
//  - If no further k-mer fits in the sequence, returns false.
//  - If the incoming character seq[pos + k - 1] maps to seedN in seed_tab, no
//    k-mer containing it can be hashed: the cursor jumps past it and init()
//    searches for the next valid k-mer.
//  - Otherwise NTHASH_CALL performs the rolling update and the cursor advances
//    by one.
//
// The explicit `k > seq_len` test is required because `seq_len - k` is
// unsigned arithmetic and wraps around when k is larger. For
// seq_len == k - 1 it wraps to exactly the maximum size_t value, which is also
// the cursor value init() leaves behind on failure; without the guard a
// repeated roll() would pass the bounds test and read seq[pos - 1] far out of
// range.
// NOLINTNEXTLINE
#define NT_HASH_ROLL(CLASS, NTHASH_CALL)                                       \
  inline bool CLASS::roll()                                                    \
  {                                                                            \
    if (pos == 0) {                                                            \
      return init();                                                           \
    }                                                                          \
    if (k > seq_len || pos > seq_len - k) {                                    \
      return false;                                                            \
    }                                                                          \
    if (seed_tab[(unsigned char)(seq[pos + k - 1])] == seedN) {                \
      pos += k;                                                                \
      return init();                                                           \
    }                                                                          \
    (NTHASH_CALL);                                                             \
    ++pos;                                                                     \
    return true;                                                               \
  }
1167 
1168 NT_HASH_INIT(NtHash,
1169  NTMC64(seq + pos,
1170  k,
1171  hash_num,
1172  forward_hash,
1173  reverse_hash,
1174  posN,
1175  hashes_vector.data()))
1176 NT_HASH_ROLL(NtHash,
1177  NTMC64(seq[pos - 1],
1178  seq[pos - 1 + k],
1179  k,
1180  hash_num,
1181  forward_hash,
1182  reverse_hash,
1183  hashes_vector.data()))
1184 
1185 NT_HASH_INIT(SeedNtHash,
1186  NTMSM64(seq + pos,
1187  seeds,
1188  k,
1189  seeds.size(),
1190  hash_num_per_seed,
1191  forward_hash,
1192  reverse_hash,
1193  posN,
1194  hashes_vector.data()))
1195 NT_HASH_ROLL(SeedNtHash,
1196  NTMSM64(seq + pos,
1197  seeds,
1198  seq[pos - 1],
1199  seq[pos - 1 + k],
1200  k,
1201  seeds.size(),
1202  hash_num_per_seed,
1203  forward_hash,
1204  reverse_hash,
1205  hashes_vector.data()))
1206 
1207 #undef NT_HASH_INIT
1208 #undef NT_HASH_ROLL
1209 
1210 inline const uint64_t*
1211 NtHash::hashes() const
1212 {
1213  return hashes_vector.data();
1214 }
1215 
1216 } // namespace btllib
1217 
1218 #endif
btllib::NtHash::init
bool init()
btllib::NtHash::NtHash
NtHash(const char *seq, size_t seq_len, unsigned k, unsigned hash_num)
Definition: nthash.hpp:1057
btllib::SeedNtHash
Definition: nthash.hpp:1024
btllib::NtHash::roll
bool roll()
btllib::NtHash
Definition: nthash.hpp:977