btllib
seq_reader.hpp
1 #ifndef BTLLIB_SEQ_READER_HPP
2 #define BTLLIB_SEQ_READER_HPP
3 
4 #include "data_stream.hpp"
5 #include "index_queue.hpp"
6 #include "seq.hpp"
7 #include "status.hpp"
8 
9 #include <algorithm>
10 #include <atomic>
11 #include <cassert>
12 #include <cctype>
13 #include <condition_variable>
14 #include <cstdio>
15 #include <cstring>
16 #include <mutex>
17 #include <stack>
18 #include <string>
19 #include <thread>
20 
21 namespace btllib {
22 
24 class SeqReader
25 {
26 public:
27  enum Flag
28  {
30  FOLD_CASE = 0,
31  NO_FOLD_CASE = 1,
35  TRIM_MASKED = 2
36  };
37 
38  SeqReader(const std::string& source_path, int flags = 0);
39  ~SeqReader();
40 
41  void close() noexcept;
42 
43  bool flagFoldCase() const { return bool(~flags & NO_FOLD_CASE); }
44  bool flagTrimMasked() const { return bool(flags & TRIM_MASKED); }
45 
46  enum Format
47  {
48  UNDETERMINED,
49  FASTA,
50  FASTQ,
51  SAM,
52  GFA2,
53  INVALID
54  };
55 
56  Format get_format() const { return format; }
57 
58  struct Record
59  {
60  size_t num = -1;
61  std::string name;
62  std::string comment;
63  std::string seq;
64  std::string qual;
65 
66  operator bool() const { return !seq.empty(); }
67  };
68 
70  Record read();
71 
72 private:
73  const std::string& source_path;
74  DataSource source;
75  unsigned flags = 0;
76  Format format = UNDETERMINED; // Format of the source file
77  bool closed = false;
78 
79  static const size_t DETERMINE_FORMAT_CHARS = 2048;
80  static const size_t BUFFER_SIZE = DETERMINE_FORMAT_CHARS;
81 
82  char* buffer = nullptr;
83  size_t buffer_start = 0;
84  size_t buffer_end = 0;
85  bool eof_newline_inserted = false;
86 
87  static const size_t RECORD_QUEUE_SIZE = 32;
88  static const size_t RECORD_BLOCK_SIZE = 128;
89 
90  static const size_t CSTRING_DEFAULT_CAP = 4096;
91 
92  static const size_t MAX_SIMULTANEOUS_SEQREADERS = 256;
93 
94  struct CString
95  {
96 
97  CString() { s[0] = '\0'; }
98  CString(const CString&) = delete;
99  CString(CString&& cstring) noexcept
100  {
101  std::swap(s, cstring.s);
102  size = cstring.size;
103  cstring.clear();
104  std::swap(cap, cstring.cap);
105  }
106  CString(const std::string& str)
107  {
108  if (str.size() + 1 > cap) {
109  cap = str.size() + 1;
110  s = (char*)std::realloc((char*)s, cap); // NOLINT
111  }
112  size = str.size();
113  memcpy(s, str.c_str(), size + 1);
114  }
115 
116  CString& operator=(const CString&) = delete;
117  CString& operator=(CString&& cstring) noexcept
118  {
119  std::swap(s, cstring.s);
120  size = cstring.size;
121  cstring.clear();
122  std::swap(cap, cstring.cap);
123  return *this;
124  }
125  CString& operator=(const std::string& str)
126  {
127  if (str.size() + 1 > cap) {
128  cap = str.size() + 1;
129  s = (char*)std::realloc((char*)s, cap); // NOLINT
130  }
131  size = str.size();
132  memcpy(s, str.c_str(), size + 1);
133  return *this;
134  }
135 
136  ~CString() { free(s); } // NOLINT
137 
138  void clear()
139  {
140  s[0] = '\0';
141  size = 0;
142  }
143  bool empty() const { return (ssize_t)size <= 0; }
144 
145  operator char*() const { return s; }
146 
147  char* s = (char*)std::malloc(CSTRING_DEFAULT_CAP); // NOLINT
148  size_t size = 0;
149  size_t cap = CSTRING_DEFAULT_CAP;
150  };
151 
152  struct RecordCString
153  {
154 
155  RecordCString() = default;
156  RecordCString(const RecordCString&) = delete;
157  RecordCString(RecordCString&& record) = default;
158 
159  RecordCString& operator=(const RecordCString&) = delete;
160  RecordCString& operator=(RecordCString&& record) = default;
161 
162  CString header;
163  CString seq;
164  CString qual;
165  };
166 
167  struct RecordCString2
168  {
169 
170  RecordCString2() = default;
171  RecordCString2(const RecordCString2&) = delete;
172  RecordCString2(RecordCString2&& record) = default;
173 
174  RecordCString2& operator=(const RecordCString2&) = delete;
175  RecordCString2& operator=(RecordCString2&& record) = default;
176 
177  CString header;
178  std::string seq;
179  CString qual;
180  };
181 
182  struct RecordCString3
183  {
184 
185  RecordCString3() = default;
186  RecordCString3(const RecordCString3&) = delete;
187  RecordCString3(RecordCString3&& record) = default;
188 
189  RecordCString3& operator=(const RecordCString3&) = delete;
190  RecordCString3& operator=(RecordCString3&& record) = default;
191 
192  CString header;
193  std::string seq;
194  std::string qual;
195  };
196 
197  CString tmp;
198 
199  std::thread* reader_thread = nullptr;
200  std::thread* seq_copier_thread = nullptr;
201  std::thread* qual_copier_thread = nullptr;
202  std::thread* postprocessor_thread = nullptr;
203  std::mutex format_mutex;
204  std::condition_variable format_cv;
205  std::atomic<bool> reader_end;
206  RecordCString* reader_record = nullptr;
207  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
208  reader_queue;
209  IndexQueueSPMC<RecordCString2, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
210  seq_copier_queue;
211  IndexQueueSPMC<RecordCString3, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
212  qual_copier_queue;
213  IndexQueueSPMC<Record, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
214  postprocessor_queue;
215 
216  // I am crying at this code, but until C++17 compliant compilers are
217  // widespread, this cannot be a static inline variable
218  static IndexQueueSPMC<Record, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block*
219  ready_records_array()
220  {
221  thread_local static IndexQueueSPMC<Record,
222  RECORD_QUEUE_SIZE,
223  RECORD_BLOCK_SIZE>::Block
224  _ready_records_array[MAX_SIMULTANEOUS_SEQREADERS];
225  return _ready_records_array;
226  }
227 
228  // Also cry worthy
229  static Record** ready_record_array()
230  {
231  thread_local static Record*
232  _ready_record_array[MAX_SIMULTANEOUS_SEQREADERS];
233  return _ready_record_array;
234  }
235 
236  // Bad code bad
237  static std::stack<unsigned>& recycled_ids() noexcept
238  {
239  static std::stack<unsigned> _recycled_ids;
240  return _recycled_ids;
241  }
242 
243  // ;-;
244  static std::mutex& recycled_ids_mutex() noexcept
245  {
246  static std::mutex _recycled_ids_mutex;
247  return _recycled_ids_mutex;
248  };
249 
250  // :(
251  static unsigned& last_id()
252  {
253  static unsigned _last_id = 0;
254  return _last_id;
255  }
256 
257  void generate_id();
258  void recycle_id() const noexcept;
259  unsigned id = 0;
260 
261  void determine_format();
262  void start_reader();
263  void start_seq_copier();
264  void start_qual_copier();
265  void start_postprocessor();
266 
267  bool load_buffer();
268 
269  bool is_fasta_buffer();
270  bool is_fastq_buffer();
271  bool is_sam_buffer();
272  bool is_gfa2_buffer();
273 
274  bool readline_buffer_append(CString& s);
275  void readline_file(CString& s);
276  void readline_file_append(CString& s);
277 
278  int read_stage = 0;
279 
280  struct read_fasta_buffer;
281  struct read_fastq_buffer;
282  struct read_sam_buffer;
283  struct read_gfa2_buffer;
284 
285  struct read_fasta_transition;
286  struct read_fastq_transition;
287  struct read_sam_transition;
288  struct read_gfa2_transition;
289 
290  struct read_fasta_file;
291  struct read_fastq_file;
292  struct read_sam_file;
293  struct read_gfa2_file;
294 
295  template<typename F>
296  void read_from_buffer(
297  F f,
298  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
299  records,
300  size_t& counter);
301 
302  template<typename F>
303  void read_transition(
304  F f,
305  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
306  records,
307  size_t& counter);
308 
309  template<typename F>
310  void read_from_file(
311  F f,
312  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
313  records,
314  size_t& counter);
315 
316  void postprocess();
317 };
318 
319 inline SeqReader::SeqReader(const std::string& source_path, int flags)
320  : source_path(source_path)
321  , source(source_path)
322  , flags(flags)
323  , reader_end(false)
324 {
325  buffer = new char[BUFFER_SIZE];
326  generate_id();
327  start_seq_copier();
328  start_qual_copier();
329  start_postprocessor();
330  {
331  std::unique_lock<std::mutex> lock(format_mutex);
332  start_reader();
333  format_cv.wait(lock);
334  }
335 }
336 
337 inline SeqReader::~SeqReader()
338 {
339  recycle_id();
340  close();
341  delete[] buffer;
342  delete reader_thread;
343  delete seq_copier_thread;
344  delete qual_copier_thread;
345  delete postprocessor_thread;
346 }
347 
348 inline void
349 SeqReader::generate_id()
350 {
351  std::unique_lock<std::mutex> lock(recycled_ids_mutex());
352  if (recycled_ids().empty()) {
353  id = ++last_id();
354  } else {
355  id = recycled_ids().top();
356  recycled_ids().pop();
357  }
358 }
359 
360 inline void
361 SeqReader::recycle_id() const noexcept
362 {
363  try {
364  std::unique_lock<std::mutex> lock(recycled_ids_mutex());
365  recycled_ids().push(id);
366  } catch (const std::exception& e) {
367  log_error("SeqReader id recycle error: " + std::string(e.what()));
368  std::exit(EXIT_FAILURE);
369  }
370 }
371 
372 inline void
373 SeqReader::close() noexcept
374 {
375  if (!closed) {
376  try {
377  closed = true;
378  reader_end = true;
379  postprocessor_queue.close();
380  postprocessor_thread->join();
381  qual_copier_queue.close();
382  qual_copier_thread->join();
383  seq_copier_queue.close();
384  seq_copier_thread->join();
385  reader_queue.close();
386  reader_thread->join();
387  source.close();
388  } catch (const std::system_error& e) {
389  log_error("SeqReader thread join failure: " + std::string(e.what()));
390  std::exit(EXIT_FAILURE);
391  }
392  }
393 }
394 
395 inline bool
396 SeqReader::load_buffer()
397 {
398  buffer_start = 0;
399  char last = buffer_end > 0 ? buffer[buffer_end - 1] : char(0);
400  buffer_end = 0;
401  do {
402  buffer_end +=
403  fread(buffer + buffer_end, 1, BUFFER_SIZE - buffer_end, source);
404  } while (buffer_end < BUFFER_SIZE && !bool(std::feof(source)));
405 
406  if (bool(std::feof(source)) && !eof_newline_inserted) {
407  if (buffer_end < BUFFER_SIZE) {
408  if ((buffer_end == 0 && last != '\n') ||
409  (buffer_end > 0 && buffer[buffer_end - 1] != '\n')) {
410  buffer[buffer_end++] = '\n';
411  }
412  eof_newline_inserted = true;
413  } else if (buffer[BUFFER_SIZE - 1] == '\n') {
414  eof_newline_inserted = true;
415  }
416  return true;
417  }
418  return bool(buffer_end);
419 }
420 
421 inline bool
422 SeqReader::is_fasta_buffer()
423 {
424  size_t current = buffer_start;
425  unsigned char c;
426  enum State
427  {
428  IN_HEADER_1,
429  IN_HEADER_2,
430  IN_SEQ
431  };
432  State state = IN_HEADER_1;
433  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
434  current < buffer_end) {
435  c = buffer[current];
436  switch (state) {
437  case IN_HEADER_1:
438  if (c == '>') {
439  state = IN_HEADER_2;
440  } else {
441  return false;
442  }
443  break;
444  case IN_HEADER_2:
445  if (c == '\n') {
446  state = IN_SEQ;
447  }
448  break;
449  case IN_SEQ:
450  if (c == '\n') {
451  state = IN_HEADER_1;
452  } else if (!bool(COMPLEMENTS[c])) {
453  return false;
454  }
455  break;
456  }
457  current++;
458  }
459  return true;
460 }
461 
462 inline bool
463 SeqReader::is_fastq_buffer()
464 {
465  size_t current = buffer_start;
466  unsigned char c;
467  enum State
468  {
469  IN_HEADER_1,
470  IN_HEADER_2,
471  IN_SEQ,
472  IN_PLUS_1,
473  IN_PLUS_2,
474  IN_QUAL
475  };
476  State state = IN_HEADER_1;
477  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
478  current < buffer_end) {
479  c = buffer[current];
480  switch (state) {
481  case IN_HEADER_1:
482  if (c == '@') {
483  state = IN_HEADER_2;
484  } else {
485  return false;
486  }
487  break;
488  case IN_HEADER_2:
489  if (c == '\n') {
490  state = IN_SEQ;
491  }
492  break;
493  case IN_SEQ:
494  if (c == '\n') {
495  state = IN_PLUS_1;
496  } else if (!bool(COMPLEMENTS[c])) {
497  return false;
498  }
499  break;
500  case IN_PLUS_1:
501  if (c == '+') {
502  state = IN_PLUS_2;
503  } else {
504  return false;
505  }
506  break;
507  case IN_PLUS_2:
508  if (c == '\n') {
509  state = IN_QUAL;
510  }
511  break;
512  case IN_QUAL:
513  if (c == '\n') {
514  state = IN_HEADER_1;
515  } else if (c < '!' || c > '~') {
516  return false;
517  }
518  break;
519  }
520  current++;
521  }
522  return true;
523 }
524 
525 inline bool
526 SeqReader::is_sam_buffer()
527 {
528  enum Column
529  {
530  QNAME = 1,
531  FLAG,
532  RNAME,
533  POS,
534  MAPQ,
535  CIGAR,
536  RNEXT,
537  PNEXT,
538  TLEN,
539  SEQ,
540  QUAL
541  };
542 
543  size_t current = buffer_start;
544 
545  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
546  current < buffer_end && buffer[current] == '@') {
547  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
548  current < buffer_end && buffer[current] != '\n') {
549  current++;
550  }
551  current++;
552  }
553 
554  int column = 1;
555  unsigned char c;
556  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
557  current < buffer_end) {
558  c = buffer[current];
559  if (c == '\n') {
560  break;
561  }
562  if (c == '\t') {
563  if (current > 0 && !bool(std::isspace(buffer[current - 1]))) {
564  column++;
565  } else {
566  return false;
567  }
568  } else {
569  switch (Column(column)) {
570  case QNAME:
571  if (bool(std::isspace(c))) {
572  return false;
573  }
574  break;
575  case FLAG:
576  if (!bool(std::isdigit(c))) {
577  return false;
578  }
579  break;
580  case RNAME:
581  if (bool(std::isspace(c))) {
582  return false;
583  }
584  break;
585  case POS:
586  if (!bool(std::isdigit(c))) {
587  return false;
588  }
589  break;
590  case MAPQ:
591  if (!bool(std::isdigit(c))) {
592  return false;
593  }
594  break;
595  case CIGAR:
596  if (bool(std::isspace(c))) {
597  return false;
598  }
599  break;
600  case RNEXT:
601  if (bool(std::isspace(c))) {
602  return false;
603  }
604  break;
605  case PNEXT:
606  if (!bool(std::isdigit(c))) {
607  return false;
608  }
609  break;
610  case TLEN:
611  if (!bool(std::isdigit(c))) {
612  return false;
613  }
614  break;
615  case SEQ:
616  if (!bool(COMPLEMENTS[c])) {
617  return false;
618  }
619  break;
620  case QUAL:
621  if (bool(std::isspace(c))) {
622  return false;
623  }
624  break;
625  default:
626  break;
627  }
628  }
629  current++;
630  }
631 
632  return current >= buffer_end || column >= QUAL;
633 }
634 
635 inline bool
636 SeqReader::is_gfa2_buffer()
637 {
638  const unsigned char specs[] = { 'H', 'S', 'F', 'E', 'G', 'O', 'U' };
639 
640  enum State
641  {
642  IN_ID,
643  IN_ID_TAB,
644  IN_REST,
645  IN_IGNORED
646  };
647 
648  auto is_a_spec = [&](unsigned char c) {
649  bool found = false;
650  for (unsigned char spec : specs) {
651  if (c == spec) {
652  found = true;
653  break;
654  }
655  }
656  return found;
657  };
658 
659  State state = is_a_spec(buffer[0]) ? IN_ID : IN_IGNORED;
660  bool has_id = false;
661  size_t current = buffer_start;
662  unsigned char c;
663  while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
664  current < buffer_end) {
665  c = buffer[current];
666  switch (state) {
667  case IN_ID:
668  if (!is_a_spec(c)) {
669  return false;
670  }
671  has_id = true;
672  state = IN_ID_TAB;
673  break;
674  case IN_ID_TAB:
675  if (c != '\t') {
676  return false;
677  }
678  state = IN_REST;
679  break;
680  case IN_REST:
681  if (c == '\n') {
682  if (current + 1 < buffer_end) {
683  state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED;
684  }
685  }
686  break;
687  case IN_IGNORED:
688  if (c == '\n') {
689  if (current + 1 < buffer_end) {
690  state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED;
691  }
692  }
693  break;
694  default:
695  break;
696  }
697  current++;
698  }
699 
700  return has_id;
701 }
702 
703 inline void
704 SeqReader::determine_format()
705 {
706  load_buffer();
707  bool empty = buffer_end - buffer_start == 1;
708  check_warning(empty, std::string(source_path) + " is empty.");
709 
710  if (empty) {
711  return;
712  }
713 
714  if (is_fasta_buffer()) {
715  format = Format::FASTA;
716  } else if (is_fastq_buffer()) {
717  format = Format::FASTQ;
718  } else if (is_sam_buffer()) {
719  format = Format::SAM;
720  } else if (is_gfa2_buffer()) {
721  format = Format::GFA2;
722  } else {
723  format = Format::INVALID;
724  log_error(std::string(source_path) + " source file is in invalid format!");
725  std::exit(EXIT_FAILURE);
726  }
727 }
728 
729 inline bool
730 SeqReader::readline_buffer_append(CString& s)
731 {
732  char c = char(0);
733  for (; buffer_start < buffer_end && (c = buffer[buffer_start]) != '\n';
734  ++buffer_start) {
735  if (s.size >= s.cap) {
736  s.cap *= 2;
737  s.s = (char*)std::realloc((char*)(s.s), s.cap); // NOLINT
738  }
739  s.s[s.size++] = c;
740  }
741  if (s.size >= s.cap) {
742  s.cap *= 2;
743  s.s = (char*)std::realloc((char*)(s.s), s.cap); // NOLINT
744  }
745  s.s[s.size] = '\0';
746  if (c == '\n') {
747  ++buffer_start;
748  return true;
749  }
750  return false;
751 }
752 
753 inline void
754 SeqReader::readline_file(CString& s)
755 {
756  s.size = getline(&(s.s), &(s.cap), source);
757 }
758 
759 inline void
760 SeqReader::readline_file_append(CString& s)
761 {
762  readline_file(tmp);
763  if (s.size + tmp.size + 1 > s.cap) {
764  s.cap = s.size + tmp.size + 1;
765  s.s = (char*)std::realloc((char*)(s.s), s.cap); // NOLINT
766  }
767  memcpy(s.s + s.size, tmp.s, tmp.size + 1);
768  s.size += tmp.size;
769 }
770 
// READ_SAM expands to the body of a functor that parses SAM lines into
// seq_reader.reader_record. It expects a variable `seq_reader` in scope.
//
//   READLINE_SECTION  statements that put the next line into seq_reader.tmp
//   MIDEND_SECTION    statements run right after a line has been parsed
//                     into the record
//   END_SECTION       statements run at the end of every loop iteration
//
// Each iteration copies tmp into a std::string. Empty lines and lines
// starting with '@' are not parsed. For any other line the header is the
// text before the first tab, the sequence is the 10th tab-separated column
// (SEQ) and the quality is the 11th (QUAL, running to the end of the line
// if no further tab follows). tmp is cleared before END_SECTION runs.
//
// NOTE(review): the loop only ends through a `return`/`break` supplied by
// the section arguments. The realloc calls before each assignment look
// redundant, since CString::operator=(const std::string&) grows the buffer
// itself. Lines with fewer than 11 columns are not detected; find()
// returning npos makes the following search restart from index 0.
// NOLINTNEXTLINE
#define READ_SAM(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \
  enum Column \
  { \
    QNAME = 1, \
    FLAG, \
    RNAME, \
    POS, \
    MAPQ, \
    CIGAR, \
    RNEXT, \
    PNEXT, \
    TLEN, \
    SEQ, \
    QUAL \
  }; \
  for (;;) { \
    READLINE_SECTION \
    std::string tmp_string = seq_reader.tmp.s; \
    if (tmp_string.length() > 0 && tmp_string[0] != '@') { \
      size_t pos = 0, pos2 = 0, pos3 = 0; \
      pos2 = tmp_string.find('\t'); \
      if (tmp_string.size() + 1 > seq_reader.reader_record->header.cap) { \
        seq_reader.reader_record->header.cap = tmp_string.size() + 1; \
        seq_reader.reader_record->header.s = \
          (char*)std::realloc((char*)(seq_reader.reader_record->header), \
                              seq_reader.reader_record->header.cap); \
      } \
      seq_reader.reader_record->header = tmp_string.substr(0, pos2); \
      for (int i = 0; i < int(SEQ) - 1; i++) { \
        pos = tmp_string.find('\t', pos + 1); \
      } \
      pos2 = tmp_string.find('\t', pos + 1); \
      pos3 = tmp_string.find('\t', pos2 + 1); \
      if (pos3 == std::string::npos) { \
        pos3 = tmp_string.length(); \
      } \
      if (tmp_string.size() + 1 > seq_reader.reader_record->seq.cap) { \
        seq_reader.reader_record->seq.cap = tmp_string.size() + 1; \
        seq_reader.reader_record->seq.s = \
          (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \
                              seq_reader.reader_record->seq.cap); \
      } \
      if (tmp_string.size() + 1 > seq_reader.reader_record->qual.cap) { \
        seq_reader.reader_record->qual.cap = tmp_string.size() + 1; \
        seq_reader.reader_record->qual.s = \
          (char*)std::realloc((char*)(seq_reader.reader_record->qual.s), \
                              seq_reader.reader_record->qual.cap); \
      } \
      seq_reader.reader_record->seq = \
        tmp_string.substr(pos + 1, pos2 - pos - 1); \
      seq_reader.reader_record->qual = \
        tmp_string.substr(pos2 + 1, pos3 - pos2 - 1); \
      MIDEND_SECTION \
    } \
    seq_reader.tmp.clear(); \
    END_SECTION \
  }
829 
// READ_GFA2 expands to the body of a functor that parses GFA2 segment lines
// into seq_reader.reader_record. It expects a variable `seq_reader` in
// scope.
//
//   READLINE_SECTION  statements that put the next line into seq_reader.tmp
//   MIDEND_SECTION    statements run right after a line has been parsed
//                     into the record
//   END_SECTION       statements run at the end of every loop iteration
//
// Each iteration copies tmp into a std::string. Only lines starting with
// 'S' are parsed. The header is the text from index 1 up to the first tab
// found at or after index 1, and the sequence is the 4th tab-separated
// column (SEQ, running to the end of the line if no further tab follows).
// tmp is cleared before END_SECTION runs.
//
// NOTE(review): for a line of the form "S<TAB>id<TAB>..." the first tab at
// or after index 1 is at index 1, so the header comes out empty; confirm
// whether the id column was intended. The loop only ends through a
// `return`/`break` supplied by the section arguments.
// NOLINTNEXTLINE
#define READ_GFA2(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \
  enum Column \
  { \
    S = 1, \
    ID, \
    LEN, \
    SEQ \
  }; \
  for (;;) { \
    READLINE_SECTION \
    std::string tmp_string = seq_reader.tmp.s; \
    if (tmp_string.length() > 0 && tmp_string[0] == 'S') { \
      size_t pos = 0, pos2 = 0; \
      pos2 = tmp_string.find('\t', 1); \
      if (tmp_string.size() + 1 > seq_reader.reader_record->header.cap) { \
        seq_reader.reader_record->header.cap = tmp_string.size() + 1; \
        seq_reader.reader_record->header.s = \
          (char*)std::realloc((char*)(seq_reader.reader_record->header.s), \
                              seq_reader.reader_record->header.cap); \
      } \
      seq_reader.reader_record->header = tmp_string.substr(1, pos2 - 1); \
      for (int i = 0; i < int(SEQ) - 1; i++) { \
        pos = tmp_string.find('\t', pos + 1); \
      } \
      pos2 = tmp_string.find('\t', pos + 1); \
      if (pos2 == std::string::npos) { \
        pos2 = tmp_string.length(); \
      } \
      if (tmp_string.size() + 1 > seq_reader.reader_record->seq.cap) { \
        seq_reader.reader_record->seq.cap = tmp_string.size() + 1; \
        seq_reader.reader_record->seq.s = \
          (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \
                              seq_reader.reader_record->seq.cap); \
      } \
      seq_reader.reader_record->seq = \
        tmp_string.substr(pos + 1, pos2 - pos - 1); \
      MIDEND_SECTION \
    } \
    seq_reader.tmp.clear(); \
    END_SECTION \
  }
872 
873 struct SeqReader::read_fasta_buffer
874 {
875  bool operator()(SeqReader& seq_reader)
876  {
877  switch (seq_reader.read_stage) {
878  case 0: {
879  if (!seq_reader.readline_buffer_append(
880  seq_reader.reader_record->header)) {
881  return false;
882  }
883  ++seq_reader.read_stage;
884  }
885  // fall through
886  case 1: {
887  if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) {
888  return false;
889  }
890  seq_reader.read_stage = 0;
891  return true;
892  }
893  }
894  return false;
895  }
896 };
897 
898 struct SeqReader::read_fastq_buffer
899 {
900  bool operator()(SeqReader& seq_reader)
901  {
902  switch (seq_reader.read_stage) {
903  case 0: {
904  if (!seq_reader.readline_buffer_append(
905  seq_reader.reader_record->header)) {
906  return false;
907  }
908  ++seq_reader.read_stage;
909  }
910  // fall through
911  case 1: {
912  if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) {
913  return false;
914  }
915  ++seq_reader.read_stage;
916  }
917  // fall through
918  case 2: {
919  if (!seq_reader.readline_buffer_append(seq_reader.tmp)) {
920  return false;
921  }
922  ++seq_reader.read_stage;
923  seq_reader.tmp.clear();
924  }
925  // fall through
926  case 3: {
927  if (!seq_reader.readline_buffer_append(
928  seq_reader.reader_record->qual)) {
929  return false;
930  }
931  seq_reader.read_stage = 0;
932  return true;
933  }
934  }
935  return false;
936  }
937 };
938 
// Reads one SAM record from the buffer into the current reader record,
// using the READ_SAM loop.
//
// Returns true as soon as one line has been parsed into the record.
// Returns false if the buffer ends in the middle of a line (the partial
// line stays in seq_reader.tmp), or if the buffer is exhausted after a line
// that was not parsed.
struct SeqReader::read_sam_buffer
{
  bool operator()(SeqReader& seq_reader)
  {
    // Arguments: read a line from the buffer / finish on a parsed record /
    // stop when the buffer is exhausted.
    READ_SAM(                                   // NOLINT
      if (!seq_reader.readline_buffer_append(   // NOLINT
            seq_reader.tmp)) { return false; }, // NOLINT
      seq_reader.tmp.clear();                   // NOLINT
      return true;                              // NOLINT
      ,
      if (seq_reader.buffer_start >= seq_reader.buffer_end) {
        return false;
      }) // NOLINT
  }
};
954 
// Reads one GFA2 segment record from the buffer into the current reader
// record, using the READ_GFA2 loop.
//
// Returns true as soon as one 'S' line has been parsed into the record.
// Returns false if the buffer ends in the middle of a line (the partial
// line stays in seq_reader.tmp), or if the buffer is exhausted after a line
// that was not parsed.
struct SeqReader::read_gfa2_buffer
{
  bool operator()(SeqReader& seq_reader)
  {
    // Arguments: read a line from the buffer / finish on a parsed record /
    // stop when the buffer is exhausted.
    READ_GFA2(                                  // NOLINT
      if (!seq_reader.readline_buffer_append(   // NOLINT
            seq_reader.tmp)) { return false; }, // NOLINT
      seq_reader.tmp.clear();                   // NOLINT
      return true;                              // NOLINT
      ,
      if (seq_reader.buffer_start >= seq_reader.buffer_end) {
        return false;
      }) // NOLINT
  }
};
970 
971 struct SeqReader::read_fasta_transition
972 {
973  void operator()(SeqReader& seq_reader)
974  {
975  switch (seq_reader.read_stage) {
976  case 0: {
977  seq_reader.readline_file_append(seq_reader.reader_record->header);
978  ++seq_reader.read_stage;
979  }
980  // fall through
981  case 1: {
982  seq_reader.readline_file_append(seq_reader.reader_record->seq);
983  seq_reader.read_stage = 0;
984  }
985  }
986  }
987 };
988 
989 struct SeqReader::read_fastq_transition
990 {
991  void operator()(SeqReader& seq_reader)
992  {
993  switch (seq_reader.read_stage) {
994  case 0: {
995  seq_reader.readline_file_append(seq_reader.reader_record->header);
996  ++seq_reader.read_stage;
997  }
998  // fall through
999  case 1: {
1000  seq_reader.readline_file_append(seq_reader.reader_record->seq);
1001  ++seq_reader.read_stage;
1002  }
1003  // fall through
1004  case 2: {
1005  seq_reader.readline_file_append(seq_reader.tmp);
1006  ++seq_reader.read_stage;
1007  seq_reader.tmp.clear();
1008  }
1009  // fall through
1010  case 3: {
1011  seq_reader.readline_file_append(seq_reader.reader_record->qual);
1012  seq_reader.read_stage = 0;
1013  }
1014  }
1015  }
1016 };
1017 
1018 struct SeqReader::read_sam_transition
1019 {
1020  void operator()(SeqReader& seq_reader)
1021  {
1022  READ_SAM( // NOLINT
1023  seq_reader.readline_file_append(seq_reader.tmp); // NOLINT
1024  , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT
1025  }
1026 };
1027 
1028 struct SeqReader::read_gfa2_transition
1029 {
1030  void operator()(SeqReader& seq_reader)
1031  {
1032  READ_GFA2( // NOLINT
1033  seq_reader.readline_file_append(seq_reader.tmp); // NOLINT
1034  , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT
1035  }
1036 };
1037 
1038 struct SeqReader::read_fasta_file
1039 {
1040  void operator()(SeqReader& seq_reader)
1041  {
1042  seq_reader.readline_file(seq_reader.reader_record->header);
1043  seq_reader.readline_file(seq_reader.reader_record->seq);
1044  }
1045 };
1046 
1047 struct SeqReader::read_fastq_file
1048 {
1049  void operator()(SeqReader& seq_reader)
1050  {
1051  seq_reader.readline_file(seq_reader.reader_record->header);
1052  seq_reader.readline_file(seq_reader.reader_record->seq);
1053  seq_reader.readline_file(seq_reader.tmp);
1054  seq_reader.readline_file(seq_reader.reader_record->qual);
1055  }
1056 };
1057 
1058 struct SeqReader::read_sam_file
1059 {
1060  void operator()(SeqReader& seq_reader)
1061  {
1062  READ_SAM( // NOLINT
1063  seq_reader.readline_file(seq_reader.tmp); // NOLINT
1064  , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT
1065  }
1066 };
1067 
1068 struct SeqReader::read_gfa2_file
1069 {
1070  void operator()(SeqReader& seq_reader)
1071  {
1072  READ_GFA2( // NOLINT
1073  seq_reader.readline_file(seq_reader.tmp); // NOLINT
1074  , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT
1075  }
1076 };
1077 
1078 template<typename F>
1079 inline void
1080 SeqReader::read_from_buffer(
1081  F f,
1082  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1083  records,
1084  size_t& counter)
1085 {
1086  for (; buffer_start < buffer_end && !reader_end;) {
1087  reader_record = &(records.data[records.count]);
1088  if (!f(*this) || reader_record->seq.empty()) {
1089  break;
1090  }
1091  records.count++;
1092  if (records.count == RECORD_BLOCK_SIZE) {
1093  records.current = 0;
1094  records.index = counter++;
1095  reader_queue.write(records);
1096  records.current = 0;
1097  records.count = 0;
1098  }
1099  }
1100 }
1101 
1102 template<typename F>
1103 inline void
1104 SeqReader::read_transition(
1105  F f,
1106  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1107  records,
1108  size_t& counter)
1109 {
1110  if (std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end) {
1111  int p = std::fgetc(source);
1112  if (p != EOF) {
1113  std::ungetc(p, source);
1114  reader_record = &(records.data[records.count]);
1115  f(*this);
1116  if (!reader_record->seq.empty()) {
1117  records.count++;
1118  if (records.count == RECORD_BLOCK_SIZE) {
1119  records.current = 0;
1120  records.index = counter++;
1121  reader_queue.write(records);
1122  records.current = 0;
1123  records.count = 0;
1124  }
1125  }
1126  }
1127  }
1128 }
1129 
1130 template<typename F>
1131 inline void
1132 SeqReader::read_from_file(
1133  F f,
1134  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1135  records,
1136  size_t& counter)
1137 {
1138  for (; std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end;) {
1139  reader_record = &(records.data[records.count]);
1140  f(*this);
1141  if (reader_record->seq.empty()) {
1142  break;
1143  }
1144  records.count++;
1145  if (records.count == RECORD_BLOCK_SIZE) {
1146  records.current = 0;
1147  records.index = counter++;
1148  reader_queue.write(records);
1149  records.current = 0;
1150  records.count = 0;
1151  }
1152  }
1153 }
1154 
1155 inline void
1156 SeqReader::start_reader()
1157 {
1158  reader_thread = new std::thread([this]() {
1159  {
1160  std::unique_lock<std::mutex> lock(format_mutex);
1161  determine_format();
1162  format_cv.notify_all();
1163  }
1164 
1165  size_t counter = 0;
1166  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1167  records;
1168  switch (format) {
1169  case FASTA: {
1170  read_from_buffer(read_fasta_buffer(), records, counter);
1171  read_transition(read_fasta_transition(), records, counter);
1172  read_from_file(read_fasta_file(), records, counter);
1173  break;
1174  }
1175  case FASTQ: {
1176  read_from_buffer(read_fastq_buffer(), records, counter);
1177  read_transition(read_fastq_transition(), records, counter);
1178  read_from_file(read_fastq_file(), records, counter);
1179  break;
1180  }
1181  case SAM: {
1182  read_from_buffer(read_sam_buffer(), records, counter);
1183  read_transition(read_sam_transition(), records, counter);
1184  read_from_file(read_sam_file(), records, counter);
1185  break;
1186  }
1187  case GFA2: {
1188  read_from_buffer(read_gfa2_buffer(), records, counter);
1189  read_transition(read_gfa2_transition(), records, counter);
1190  read_from_file(read_gfa2_file(), records, counter);
1191  break;
1192  }
1193  default: {
1194  break;
1195  }
1196  }
1197 
1198  reader_end = true;
1199  records.current = 0;
1200  records.index = counter++;
1201  size_t last_count = records.count;
1202  reader_queue.write(records);
1203  if (last_count > 0) {
1204  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1205  dummy;
1206  dummy.index = counter++;
1207  dummy.current = 0;
1208  dummy.count = 0;
1209  reader_queue.write(dummy);
1210  }
1211  });
1212 }
1213 
1214 inline void
1215 SeqReader::start_seq_copier()
1216 {
1217  seq_copier_thread = new std::thread([this]() {
1218  IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1219  records_in;
1220  decltype(seq_copier_queue)::Block records_out;
1221  for (;;) {
1222  reader_queue.read(records_in);
1223  for (size_t i = 0; i < records_in.count; i++) {
1224  records_out.data[i].header = std::move(records_in.data[i].header);
1225  records_out.data[i].seq =
1226  std::string(records_in.data[i].seq.s, records_in.data[i].seq.size);
1227  records_out.data[i].qual = std::move(records_in.data[i].qual);
1228  auto& seq = records_out.data[i].seq;
1229  if (!seq.empty() && seq.back() == '\n') {
1230  seq.pop_back();
1231  }
1232  }
1233  records_out.count = records_in.count;
1234  records_out.current = records_in.current;
1235  records_out.index = records_in.index;
1236  if (records_out.count == 0) {
1237  seq_copier_queue.write(records_out);
1238  break;
1239  }
1240  seq_copier_queue.write(records_out);
1241  }
1242  });
1243 }
1244 
1245 inline void
1246 SeqReader::start_qual_copier()
1247 {
1248  qual_copier_thread = new std::thread([this]() {
1249  decltype(seq_copier_queue)::Block records_in;
1250  decltype(qual_copier_queue)::Block records_out;
1251  for (;;) {
1252  seq_copier_queue.read(records_in);
1253  for (size_t i = 0; i < records_in.count; i++) {
1254  records_out.data[i].header = std::move(records_in.data[i].header);
1255  records_out.data[i].seq = std::move(records_in.data[i].seq);
1256  records_out.data[i].qual =
1257  std::string(records_in.data[i].qual.s, records_in.data[i].qual.size);
1258  auto& qual = records_out.data[i].qual;
1259  if (!qual.empty() && qual.back() == '\n') {
1260  qual.pop_back();
1261  }
1262  }
1263  records_out.count = records_in.count;
1264  records_out.current = records_in.current;
1265  records_out.index = records_in.index;
1266  if (records_out.count == 0) {
1267  qual_copier_queue.write(records_out);
1268  break;
1269  }
1270  qual_copier_queue.write(records_out);
1271  }
1272  });
1273 }
1274 
1275 inline void
1276 SeqReader::start_postprocessor()
1277 {
1278  postprocessor_thread = new std::thread([this]() {
1279  decltype(qual_copier_queue)::Block records_in;
1280  decltype(postprocessor_queue)::Block records_out;
1281  for (;;) {
1282  qual_copier_queue.read(records_in);
1283  for (size_t i = 0; i < records_in.count; i++) {
1284  char* space = std::strstr(records_in.data[i].header, " ");
1285  size_t name_start = (format == FASTA || format == FASTQ) ? 1 : 0;
1286  if (space == nullptr) {
1287  records_out.data[i].name =
1288  std::string(records_in.data[i].header.s + name_start,
1289  records_in.data[i].header.size - name_start);
1290  records_out.data[i].comment = "";
1291  } else {
1292  records_out.data[i].name =
1293  std::string(records_in.data[i].header.s + name_start,
1294  space - records_in.data[i].header.s - name_start);
1295  records_out.data[i].comment =
1296  std::string(space + 1,
1297  records_in.data[i].header.size -
1298  (space - records_in.data[i].header.s) - 1);
1299  }
1300  records_in.data[i].header.clear();
1301  records_out.data[i].seq = std::move(records_in.data[i].seq);
1302  records_out.data[i].qual = std::move(records_in.data[i].qual);
1303  auto& name = records_out.data[i].name;
1304  auto& comment = records_out.data[i].comment;
1305  auto& seq = records_out.data[i].seq;
1306  auto& qual = records_out.data[i].qual;
1307  if (!name.empty() && name.back() == '\n') {
1308  name.pop_back();
1309  }
1310  if (!comment.empty() && comment.back() == '\n') {
1311  comment.pop_back();
1312  }
1313  if (flagTrimMasked()) {
1314  const auto len = seq.length();
1315  size_t trim_start = 0, trim_end = seq.length();
1316  while (trim_start <= len && bool(islower(seq[trim_start]))) {
1317  trim_start++;
1318  }
1319  while (trim_end > 0 && bool(islower(seq[trim_end - 1]))) {
1320  trim_end--;
1321  }
1322  seq.erase(trim_end);
1323  seq.erase(0, trim_start);
1324  if (!qual.empty()) {
1325  qual.erase(trim_end);
1326  qual.erase(0, trim_start);
1327  }
1328  }
1329  if (flagFoldCase()) {
1330  for (auto& c : seq) {
1331  char old = c;
1332  c = CAPITALS[unsigned(c)];
1333  if (!bool(c)) {
1334  log_error(std::string("A sequence contains invalid "
1335  "IUPAC character: ") +
1336  old);
1337  std::exit(EXIT_FAILURE);
1338  }
1339  }
1340  }
1341  records_out.data[i].num = records_in.index * RECORD_BLOCK_SIZE + i;
1342  }
1343  records_out.count = records_in.count;
1344  records_out.current = records_in.current;
1345  records_out.index = records_in.index;
1346  if (records_out.count == 0) {
1347  postprocessor_queue.write(records_out);
1348  break;
1349  }
1350  postprocessor_queue.write(records_out);
1351  }
1352  });
1353 }
1354 
1355 inline SeqReader::Record
1357 {
1358  auto& ready_records = ready_records_array()[id];
1359  auto& ready_record = ready_record_array()[id];
1360  if (ready_records.count <= ready_records.current) {
1361  postprocessor_queue.read(ready_records);
1362  if (ready_records.count <= ready_records.current) {
1363  close();
1364  return Record();
1365  }
1366  }
1367  ready_record = &(ready_records.data[ready_records.current++]);
1368  return std::move(*ready_record);
1369 }
1370 
1371 } // namespace btllib
1372 
1373 #endif
btllib::SeqReader::Record
Definition: seq_reader.hpp:59
btllib::SeqReader::read
Record read()
Definition: seq_reader.hpp:1356
btllib::SeqReader
Definition: seq_reader.hpp:25
btllib::SeqReader::FOLD_CASE
@ FOLD_CASE
Definition: seq_reader.hpp:30
btllib::SeqReader::NO_TRIM_MASKED
@ NO_TRIM_MASKED
Definition: seq_reader.hpp:34
btllib::SeqReader::Flag
Flag
Definition: seq_reader.hpp:28