1 #ifndef BTLLIB_SEQ_READER_HPP
2 #define BTLLIB_SEQ_READER_HPP
4 #include "data_stream.hpp"
5 #include "index_queue.hpp"
13 #include <condition_variable>
// Construct a reader over the input named by `source_path`.
// `flags` defaults to 0; the flag accessors of this class test it against
// NO_FOLD_CASE and TRIM_MASKED.
38 SeqReader(
const std::string& source_path,
int flags = 0);
// Shut the reader down. The out-of-line definition closes the internal
// record queues and joins the worker threads. Declared noexcept.
41 void close() noexcept;
43 bool flagFoldCase()
const {
return bool(~flags & NO_FOLD_CASE); }
44 bool flagTrimMasked()
const {
return bool(flags & TRIM_MASKED); }
56 Format get_format()
const {
return format; }
66 operator bool()
const {
return !seq.empty(); }
// Path of the input, stored by value. A `const std::string&` member would
// merely alias the constructor argument, which dangles as soon as a caller
// passes a temporary (for example a string literal); later reads of
// `source_path`, such as the log messages built from it, would then touch a
// destroyed string. The constructor's `source_path(source_path)` initialiser
// works unchanged and now copies the argument.
const std::string source_path;
// Detected input format; starts as UNDETERMINED and is assigned by
// determine_format().
76 Format format = UNDETERMINED;
// Upper bound on how many characters past buffer_start the is_*_buffer()
// scanners examine when guessing the format.
79 static const size_t DETERMINE_FORMAT_CHARS = 2048;
// Size of the read buffer; the constructor allocates new char[BUFFER_SIZE].
80 static const size_t BUFFER_SIZE = DETERMINE_FORMAT_CHARS;
// Read buffer; null until the constructor allocates it.
82 char* buffer =
nullptr;
// Buffered data not yet consumed lies in [buffer_start, buffer_end).
83 size_t buffer_start = 0;
84 size_t buffer_end = 0;
// Set by load_buffer() once end-of-file handling has made sure the data ends
// with '\n'; prevents a second newline from being appended.
85 bool eof_newline_inserted =
false;
// Template arguments for the IndexQueue instantiations used between stages.
87 static const size_t RECORD_QUEUE_SIZE = 32;
88 static const size_t RECORD_BLOCK_SIZE = 128;
// Initial malloc'ed capacity of every CString.
90 static const size_t CSTRING_DEFAULT_CAP = 4096;
// Length of the thread_local arrays that are indexed by reader id.
92 static const size_t MAX_SIMULTANEOUS_SEQREADERS = 256;
97 CString() { s[0] =
'\0'; }
98 CString(
const CString&) =
delete;
99 CString(CString&& cstring) noexcept
101 std::swap(s, cstring.s);
104 std::swap(cap, cstring.cap);
106 CString(
const std::string& str)
108 if (str.size() + 1 > cap) {
109 cap = str.size() + 1;
110 s = (
char*)std::realloc((
char*)s, cap);
113 memcpy(s, str.c_str(), size + 1);
116 CString& operator=(
const CString&) =
delete;
117 CString& operator=(CString&& cstring) noexcept
119 std::swap(s, cstring.s);
122 std::swap(cap, cstring.cap);
125 CString& operator=(
const std::string& str)
127 if (str.size() + 1 > cap) {
128 cap = str.size() + 1;
129 s = (
char*)std::realloc((
char*)s, cap);
132 memcpy(s, str.c_str(), size + 1);
136 ~CString() { free(s); }
143 bool empty()
const {
return (ssize_t)size <= 0; }
145 operator char*()
const {
return s; }
// Heap buffer obtained from malloc; grown with realloc by the assigning
// members and released with free() in the destructor.
147 char* s = (
char*)std::malloc(CSTRING_DEFAULT_CAP);
// Allocated size of `s` in bytes.
149 size_t cap = CSTRING_DEFAULT_CAP;
155 RecordCString() =
default;
156 RecordCString(
const RecordCString&) =
delete;
157 RecordCString(RecordCString&& record) =
default;
159 RecordCString& operator=(
const RecordCString&) =
delete;
160 RecordCString& operator=(RecordCString&& record) =
default;
167 struct RecordCString2
170 RecordCString2() =
default;
171 RecordCString2(
const RecordCString2&) =
delete;
172 RecordCString2(RecordCString2&& record) =
default;
174 RecordCString2& operator=(
const RecordCString2&) =
delete;
175 RecordCString2& operator=(RecordCString2&& record) =
default;
182 struct RecordCString3
185 RecordCString3() =
default;
186 RecordCString3(
const RecordCString3&) =
delete;
187 RecordCString3(RecordCString3&& record) =
default;
189 RecordCString3& operator=(
const RecordCString3&) =
delete;
190 RecordCString3& operator=(RecordCString3&& record) =
default;
// Worker threads of the pipeline. Each pointer is assigned a
// `new std::thread` by the matching start_*() member, joined in close() and
// deleted in the destructor.
199 std::thread* reader_thread =
nullptr;
200 std::thread* seq_copier_thread =
nullptr;
201 std::thread* qual_copier_thread =
nullptr;
202 std::thread* postprocessor_thread =
nullptr;
// The constructor waits on format_cv (holding format_mutex); the reader
// thread locks the same mutex and calls notify_all().
203 std::mutex format_mutex;
204 std::condition_variable format_cv;
// Checked by the read loops, which stop once it becomes true.
205 std::atomic<bool> reader_end;
// Record slot currently being filled by the reader thread.
206 RecordCString* reader_record =
nullptr;
207 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
209 IndexQueueSPMC<RecordCString2, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
211 IndexQueueSPMC<RecordCString3, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
213 IndexQueueSPMC<Record, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>
218 static IndexQueueSPMC<Record, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block*
219 ready_records_array()
221 thread_local
static IndexQueueSPMC<Record,
223 RECORD_BLOCK_SIZE>::Block
224 _ready_records_array[MAX_SIMULTANEOUS_SEQREADERS];
225 return _ready_records_array;
229 static Record** ready_record_array()
231 thread_local
static Record*
232 _ready_record_array[MAX_SIMULTANEOUS_SEQREADERS];
233 return _ready_record_array;
237 static std::stack<unsigned>& recycled_ids() noexcept
239 static std::stack<unsigned> _recycled_ids;
240 return _recycled_ids;
244 static std::mutex& recycled_ids_mutex() noexcept
246 static std::mutex _recycled_ids_mutex;
247 return _recycled_ids_mutex;
251 static unsigned& last_id()
253 static unsigned _last_id = 0;
// Pushes this reader's id onto the shared recycled-id stack.
258 void recycle_id() const noexcept;
// Sets `format` by trying the is_*_buffer() checks in the order
// FASTA, FASTQ, SAM, GFA2.
261 void determine_format();
// Each start_*() member spawns the worker thread of the same name.
263 void start_seq_copier();
264 void start_qual_copier();
265 void start_postprocessor();
// Format probes over the buffered data.
269 bool is_fasta_buffer();
270 bool is_fastq_buffer();
271 bool is_sam_buffer();
272 bool is_gfa2_buffer();
// Line readers: the first consumes from the in-memory buffer, the other two
// read from `source`.
274 bool readline_buffer_append(CString& s);
275 void readline_file(CString& s);
276 void readline_file_append(CString& s);
280 struct read_fasta_buffer;
281 struct read_fastq_buffer;
282 struct read_sam_buffer;
283 struct read_gfa2_buffer;
285 struct read_fasta_transition;
286 struct read_fastq_transition;
287 struct read_sam_transition;
288 struct read_gfa2_transition;
290 struct read_fasta_file;
291 struct read_fastq_file;
292 struct read_sam_file;
293 struct read_gfa2_file;
296 void read_from_buffer(
298 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
303 void read_transition(
305 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
312 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
// Constructor: initialises source_path and source from the path argument,
// allocates the read buffer, starts worker threads and then blocks on
// format_cv under format_mutex.
319 inline SeqReader::SeqReader(const std::
string& source_path,
int flags)
320 : source_path(source_path)
321 , source(source_path)
// Raw array allocation of BUFFER_SIZE bytes.
325 buffer =
new char[BUFFER_SIZE];
329 start_postprocessor();
331 std::unique_lock<std::mutex> lock(format_mutex);
// NOTE(review): if this wait is not guarded by a predicate on `format`, a
// notification sent before the wait begins (or a spurious wake-up) would be
// mishandled -- confirm against the lines around this call.
333 format_cv.wait(lock);
// Destructor: deletes the four std::thread objects created by start_*().
337 inline SeqReader::~SeqReader()
342 delete reader_thread;
343 delete seq_copier_thread;
344 delete qual_copier_thread;
345 delete postprocessor_thread;
// Chooses the id for this reader while holding recycled_ids_mutex().
349 SeqReader::generate_id()
351 std::unique_lock<std::mutex> lock(recycled_ids_mutex());
// When no recycled id is available a different path is taken; otherwise the
// most recently recycled id is reused and removed from the stack.
352 if (recycled_ids().empty()) {
355 id = recycled_ids().top();
356 recycled_ids().pop();
// Returns this reader's id to the shared stack of recycled ids.
// The function is noexcept: a std::exception raised while doing so is logged
// and the process exits with EXIT_FAILURE.
361 SeqReader::recycle_id() const noexcept
// The stack is shared between readers, so it is modified under its mutex.
364 std::unique_lock<std::mutex> lock(recycled_ids_mutex());
365 recycled_ids().push(
id);
366 }
catch (
const std::exception& e) {
367 log_error(
"SeqReader id recycle error: " + std::string(e.what()));
368 std::exit(EXIT_FAILURE);
// Stops the pipeline: each queue is closed and the thread of the same stage
// is joined, starting with the postprocessor and ending with the reader.
// A std::system_error from joining is logged and the process exits with
// EXIT_FAILURE.
373 SeqReader::close() noexcept
379 postprocessor_queue.close();
380 postprocessor_thread->join();
381 qual_copier_queue.close();
382 qual_copier_thread->join();
383 seq_copier_queue.close();
384 seq_copier_thread->join();
385 reader_queue.close();
386 reader_thread->join();
388 }
catch (
const std::system_error& e) {
389 log_error(
"SeqReader thread join failure: " + std::string(e.what()));
390 std::exit(EXIT_FAILURE);
// Fills `buffer` from `source` and guarantees that the data seen at
// end-of-file ends with '\n'. Returns true when the buffer holds any bytes.
396 SeqReader::load_buffer()
// Remember the final byte of the previous buffer contents (0 if there were
// none) so the EOF check below still works when this call reads nothing.
399 char last = buffer_end > 0 ? buffer[buffer_end - 1] : char(0);
// Keep reading until the buffer is full or end-of-file is reached.
// NOTE(review): the loop condition tests feof() only; a read error that sets
// ferror() without feof() looks like it would never leave this loop -- confirm.
403 fread(buffer + buffer_end, 1, BUFFER_SIZE - buffer_end, source);
404 }
while (buffer_end < BUFFER_SIZE && !
bool(std::feof(source)));
// At EOF, handled at most once (eof_newline_inserted): make sure the last
// line is newline-terminated.
406 if (
bool(std::feof(source)) && !eof_newline_inserted) {
// There is room for one more byte.
407 if (buffer_end < BUFFER_SIZE) {
// Append '\n' unless the last byte read (in this call or the previous one)
// already is one.
408 if ((buffer_end == 0 && last !=
'\n') ||
409 (buffer_end > 0 && buffer[buffer_end - 1] !=
'\n')) {
410 buffer[buffer_end++] =
'\n';
412 eof_newline_inserted =
true;
413 }
// Buffer is full: nothing to add if it already ends with a newline.
else if (buffer[BUFFER_SIZE - 1] ==
'\n') {
414 eof_newline_inserted =
true;
418 return bool(buffer_end);
422 SeqReader::is_fasta_buffer()
424 size_t current = buffer_start;
432 State state = IN_HEADER_1;
433 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
434 current < buffer_end) {
452 }
else if (!
bool(COMPLEMENTS[c])) {
463 SeqReader::is_fastq_buffer()
465 size_t current = buffer_start;
476 State state = IN_HEADER_1;
477 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
478 current < buffer_end) {
496 }
else if (!
bool(COMPLEMENTS[c])) {
515 }
else if (c < '!' || c >
'~') {
526 SeqReader::is_sam_buffer()
543 size_t current = buffer_start;
545 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
546 current < buffer_end && buffer[current] ==
'@') {
547 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
548 current < buffer_end && buffer[current] !=
'\n') {
556 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
557 current < buffer_end) {
563 if (current > 0 && !
bool(std::isspace(buffer[current - 1]))) {
569 switch (Column(column)) {
571 if (
bool(std::isspace(c))) {
576 if (!
bool(std::isdigit(c))) {
581 if (
bool(std::isspace(c))) {
586 if (!
bool(std::isdigit(c))) {
591 if (!
bool(std::isdigit(c))) {
596 if (
bool(std::isspace(c))) {
601 if (
bool(std::isspace(c))) {
606 if (!
bool(std::isdigit(c))) {
611 if (!
bool(std::isdigit(c))) {
616 if (!
bool(COMPLEMENTS[c])) {
621 if (
bool(std::isspace(c))) {
632 return current >= buffer_end || column >= QUAL;
// Probe for GFA2: scans at most DETERMINE_FORMAT_CHARS buffered characters.
636 SeqReader::is_gfa2_buffer()
// Characters that is_a_spec() accepts as the first character of a line.
638 const unsigned char specs[] = {
'H',
'S',
'F',
'E',
'G',
'O',
'U' };
// Helper that compares a character against every entry of `specs`.
648 auto is_a_spec = [&](
unsigned char c) {
650 for (
unsigned char spec : specs) {
// NOTE(review): the initial state is derived from buffer[0] although the
// scan below starts at buffer_start; the two agree only when buffer_start
// is 0 -- confirm that this always holds when the probe runs.
659 State state = is_a_spec(buffer[0]) ? IN_ID : IN_IGNORED;
661 size_t current = buffer_start;
663 while (current < buffer_start + DETERMINE_FORMAT_CHARS &&
664 current < buffer_end) {
// The next state is chosen from the character that follows, when there is
// one inside the buffered data.
682 if (current + 1 < buffer_end) {
683 state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED;
689 if (current + 1 < buffer_end) {
690 state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED;
// Decides the input format from the buffered data and stores it in `format`.
// If no probe matches, the format is INVALID, an error is logged and the
// process exits with EXIT_FAILURE.
704 SeqReader::determine_format()
// Exactly one buffered character is treated as an empty input.
// NOTE(review): presumably that character is the '\n' load_buffer() appends
// at end-of-file -- confirm.
707 bool empty = buffer_end - buffer_start == 1;
708 check_warning(empty, std::string(source_path) +
" is empty.");
// Probes are tried in a fixed order; the first one that matches wins.
714 if (is_fasta_buffer()) {
715 format = Format::FASTA;
716 }
else if (is_fastq_buffer()) {
717 format = Format::FASTQ;
718 }
else if (is_sam_buffer()) {
719 format = Format::SAM;
720 }
else if (is_gfa2_buffer()) {
721 format = Format::GFA2;
// No probe matched.
723 format = Format::INVALID;
724 log_error(std::string(source_path) +
" source file is in invalid format!");
725 std::exit(EXIT_FAILURE);
730 SeqReader::readline_buffer_append(CString& s)
733 for (; buffer_start < buffer_end && (c = buffer[buffer_start]) !=
'\n';
735 if (s.size >= s.cap) {
737 s.s = (
char*)std::realloc((
char*)(s.s), s.cap);
741 if (s.size >= s.cap) {
743 s.s = (
char*)std::realloc((
char*)(s.s), s.cap);
// Reads one line from `source` into `s`, replacing its previous contents.
754 SeqReader::readline_file(CString& s)
// getline() may reallocate s.s and update s.cap. Its return value is stored
// into s.size unchecked; CString::empty() casts size back to a signed type,
// so a failure return reads as "empty".
756 s.size = getline(&(s.s), &(s.cap), source);
// Appends the contents of the scratch string `tmp` to `s`.
760 SeqReader::readline_file_append(CString& s)
// Grow `s` so it can hold its current text, tmp's text and the terminator.
763 if (s.size + tmp.size + 1 > s.cap) {
764 s.cap = s.size + tmp.size + 1;
765 s.s = (
char*)std::realloc((
char*)(s.s), s.cap);
// Copies tmp including its NUL terminator to the end of s.
767 memcpy(s.s + s.size, tmp.s, tmp.size + 1);
772 #define READ_SAM(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \
789 std::string tmp_string = seq_reader.tmp.s; \
790 if (tmp_string.length() > 0 && tmp_string[0] != '@') { \
791 size_t pos = 0, pos2 = 0, pos3 = 0; \
792 pos2 = tmp_string.find('\t'); \
793 if (tmp_string.size() + 1 > seq_reader.reader_record->header.cap) { \
794 seq_reader.reader_record->header.cap = tmp_string.size() + 1; \
795 seq_reader.reader_record->header.s = \
796 (char*)std::realloc((char*)(seq_reader.reader_record->header), \
797 seq_reader.reader_record->header.cap); \
799 seq_reader.reader_record->header = tmp_string.substr(0, pos2); \
800 for (int i = 0; i < int(SEQ) - 1; i++) { \
801 pos = tmp_string.find('\t', pos + 1); \
803 pos2 = tmp_string.find('\t', pos + 1); \
804 pos3 = tmp_string.find('\t', pos2 + 1); \
805 if (pos3 == std::string::npos) { \
806 pos3 = tmp_string.length(); \
808 if (tmp_string.size() + 1 > seq_reader.reader_record->seq.cap) { \
809 seq_reader.reader_record->seq.cap = tmp_string.size() + 1; \
810 seq_reader.reader_record->seq.s = \
811 (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \
812 seq_reader.reader_record->seq.cap); \
814 if (tmp_string.size() + 1 > seq_reader.reader_record->qual.cap) { \
815 seq_reader.reader_record->qual.cap = tmp_string.size() + 1; \
816 seq_reader.reader_record->qual.s = \
817 (char*)std::realloc((char*)(seq_reader.reader_record->qual.s), \
818 seq_reader.reader_record->qual.cap); \
820 seq_reader.reader_record->seq = \
821 tmp_string.substr(pos + 1, pos2 - pos - 1); \
822 seq_reader.reader_record->qual = \
823 tmp_string.substr(pos2 + 1, pos3 - pos2 - 1); \
826 seq_reader.tmp.clear(); \
831 #define READ_GFA2(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \
841 std::string tmp_string = seq_reader.tmp.s; \
842 if (tmp_string.length() > 0 && tmp_string[0] == 'S') { \
843 size_t pos = 0, pos2 = 0; \
844 pos2 = tmp_string.find('\t', 1); \
845 if (tmp_string.size() + 1 > seq_reader.reader_record->header.cap) { \
846 seq_reader.reader_record->header.cap = tmp_string.size() + 1; \
847 seq_reader.reader_record->header.s = \
848 (char*)std::realloc((char*)(seq_reader.reader_record->header.s), \
849 seq_reader.reader_record->header.cap); \
851 seq_reader.reader_record->header = tmp_string.substr(1, pos2 - 1); \
852 for (int i = 0; i < int(SEQ) - 1; i++) { \
853 pos = tmp_string.find('\t', pos + 1); \
855 pos2 = tmp_string.find('\t', pos + 1); \
856 if (pos2 == std::string::npos) { \
857 pos2 = tmp_string.length(); \
859 if (tmp_string.size() + 1 > seq_reader.reader_record->seq.cap) { \
860 seq_reader.reader_record->seq.cap = tmp_string.size() + 1; \
861 seq_reader.reader_record->seq.s = \
862 (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \
863 seq_reader.reader_record->seq.cap); \
865 seq_reader.reader_record->seq = \
866 tmp_string.substr(pos + 1, pos2 - pos - 1); \
869 seq_reader.tmp.clear(); \
873 struct SeqReader::read_fasta_buffer
875 bool operator()(SeqReader& seq_reader)
877 switch (seq_reader.read_stage) {
879 if (!seq_reader.readline_buffer_append(
880 seq_reader.reader_record->header)) {
883 ++seq_reader.read_stage;
887 if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) {
890 seq_reader.read_stage = 0;
898 struct SeqReader::read_fastq_buffer
900 bool operator()(SeqReader& seq_reader)
902 switch (seq_reader.read_stage) {
904 if (!seq_reader.readline_buffer_append(
905 seq_reader.reader_record->header)) {
908 ++seq_reader.read_stage;
912 if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) {
915 ++seq_reader.read_stage;
919 if (!seq_reader.readline_buffer_append(seq_reader.tmp)) {
922 ++seq_reader.read_stage;
923 seq_reader.tmp.clear();
927 if (!seq_reader.readline_buffer_append(
928 seq_reader.reader_record->qual)) {
931 seq_reader.read_stage = 0;
939 struct SeqReader::read_sam_buffer
941 bool operator()(SeqReader& seq_reader)
944 if (!seq_reader.readline_buffer_append(
945 seq_reader.tmp)) { return false; },
946 seq_reader.tmp.clear();
949 if (seq_reader.buffer_start >= seq_reader.buffer_end) {
955 struct SeqReader::read_gfa2_buffer
957 bool operator()(SeqReader& seq_reader)
960 if (!seq_reader.readline_buffer_append(
961 seq_reader.tmp)) { return false; },
962 seq_reader.tmp.clear();
965 if (seq_reader.buffer_start >= seq_reader.buffer_end) {
971 struct SeqReader::read_fasta_transition
973 void operator()(SeqReader& seq_reader)
975 switch (seq_reader.read_stage) {
977 seq_reader.readline_file_append(seq_reader.reader_record->header);
978 ++seq_reader.read_stage;
982 seq_reader.readline_file_append(seq_reader.reader_record->seq);
983 seq_reader.read_stage = 0;
989 struct SeqReader::read_fastq_transition
991 void operator()(SeqReader& seq_reader)
993 switch (seq_reader.read_stage) {
995 seq_reader.readline_file_append(seq_reader.reader_record->header);
996 ++seq_reader.read_stage;
1000 seq_reader.readline_file_append(seq_reader.reader_record->seq);
1001 ++seq_reader.read_stage;
1005 seq_reader.readline_file_append(seq_reader.tmp);
1006 ++seq_reader.read_stage;
1007 seq_reader.tmp.clear();
1011 seq_reader.readline_file_append(seq_reader.reader_record->qual);
1012 seq_reader.read_stage = 0;
1018 struct SeqReader::read_sam_transition
1020 void operator()(SeqReader& seq_reader)
1023 seq_reader.readline_file_append(seq_reader.tmp);
1024 , , if (
bool(feof(seq_reader.source))) { break; })
1028 struct SeqReader::read_gfa2_transition
1030 void operator()(SeqReader& seq_reader)
1033 seq_reader.readline_file_append(seq_reader.tmp);
1034 , , if (
bool(feof(seq_reader.source))) { break; })
1038 struct SeqReader::read_fasta_file
1040 void operator()(SeqReader& seq_reader)
1042 seq_reader.readline_file(seq_reader.reader_record->header);
1043 seq_reader.readline_file(seq_reader.reader_record->seq);
1047 struct SeqReader::read_fastq_file
1049 void operator()(SeqReader& seq_reader)
1051 seq_reader.readline_file(seq_reader.reader_record->header);
1052 seq_reader.readline_file(seq_reader.reader_record->seq);
1053 seq_reader.readline_file(seq_reader.tmp);
1054 seq_reader.readline_file(seq_reader.reader_record->qual);
1058 struct SeqReader::read_sam_file
1060 void operator()(SeqReader& seq_reader)
1063 seq_reader.readline_file(seq_reader.tmp);
1064 , , if (
bool(feof(seq_reader.source))) { break; })
1068 struct SeqReader::read_gfa2_file
1070 void operator()(SeqReader& seq_reader)
1073 seq_reader.readline_file(seq_reader.tmp);
1074 , , if (
bool(feof(seq_reader.source))) { break; })
1078 template<
typename F>
1080 SeqReader::read_from_buffer(
1082 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1086 for (; buffer_start < buffer_end && !reader_end;) {
1087 reader_record = &(records.data[records.count]);
1088 if (!f(*
this) || reader_record->seq.empty()) {
1092 if (records.count == RECORD_BLOCK_SIZE) {
1093 records.current = 0;
1094 records.index = counter++;
1095 reader_queue.write(records);
1096 records.current = 0;
1102 template<
typename F>
1104 SeqReader::read_transition(
1106 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1110 if (std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end) {
1111 int p = std::fgetc(source);
1113 std::ungetc(p, source);
1114 reader_record = &(records.data[records.count]);
1116 if (!reader_record->seq.empty()) {
1118 if (records.count == RECORD_BLOCK_SIZE) {
1119 records.current = 0;
1120 records.index = counter++;
1121 reader_queue.write(records);
1122 records.current = 0;
1130 template<
typename F>
1132 SeqReader::read_from_file(
1134 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block&
1138 for (; std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end;) {
1139 reader_record = &(records.data[records.count]);
1141 if (reader_record->seq.empty()) {
1145 if (records.count == RECORD_BLOCK_SIZE) {
1146 records.current = 0;
1147 records.index = counter++;
1148 reader_queue.write(records);
1149 records.current = 0;
1156 SeqReader::start_reader()
1158 reader_thread =
new std::thread([
this]() {
1160 std::unique_lock<std::mutex> lock(format_mutex);
1162 format_cv.notify_all();
1166 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1170 read_from_buffer(read_fasta_buffer(), records, counter);
1171 read_transition(read_fasta_transition(), records, counter);
1172 read_from_file(read_fasta_file(), records, counter);
1176 read_from_buffer(read_fastq_buffer(), records, counter);
1177 read_transition(read_fastq_transition(), records, counter);
1178 read_from_file(read_fastq_file(), records, counter);
1182 read_from_buffer(read_sam_buffer(), records, counter);
1183 read_transition(read_sam_transition(), records, counter);
1184 read_from_file(read_sam_file(), records, counter);
1188 read_from_buffer(read_gfa2_buffer(), records, counter);
1189 read_transition(read_gfa2_transition(), records, counter);
1190 read_from_file(read_gfa2_file(), records, counter);
1199 records.current = 0;
1200 records.index = counter++;
1201 size_t last_count = records.count;
1202 reader_queue.write(records);
1203 if (last_count > 0) {
1204 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1206 dummy.index = counter++;
1209 reader_queue.write(dummy);
// Spawns the seq-copier thread: it takes blocks from reader_queue, converts
// each record's sequence into a std::string and writes the block to
// seq_copier_queue.
1215 SeqReader::start_seq_copier()
1217 seq_copier_thread =
new std::thread([
this]() {
1218 IndexQueueSPSC<RecordCString, RECORD_QUEUE_SIZE, RECORD_BLOCK_SIZE>::Block
1220 decltype(seq_copier_queue)::Block records_out;
1222 reader_queue.read(records_in);
1223 for (
size_t i = 0; i < records_in.count; i++) {
// header and qual are moved through unchanged; only seq is copied here.
1224 records_out.data[i].header = std::move(records_in.data[i].header);
1225 records_out.data[i].seq =
1226 std::string(records_in.data[i].seq.s, records_in.data[i].seq.size);
1227 records_out.data[i].qual = std::move(records_in.data[i].qual);
1228 auto& seq = records_out.data[i].seq;
// Special handling when the copied sequence still ends with the line's '\n'.
1229 if (!seq.empty() && seq.back() ==
'\n') {
// Carry the block bookkeeping over to the output block.
1233 records_out.count = records_in.count;
1234 records_out.current = records_in.current;
1235 records_out.index = records_in.index;
// A block with no records is also written to the output queue.
1236 if (records_out.count == 0) {
1237 seq_copier_queue.write(records_out);
1240 seq_copier_queue.write(records_out);
// Spawns the qual-copier thread: it takes blocks from seq_copier_queue,
// converts each record's quality string into a std::string and writes the
// block to qual_copier_queue.
1246 SeqReader::start_qual_copier()
1248 qual_copier_thread =
new std::thread([
this]() {
1249 decltype(seq_copier_queue)::Block records_in;
1250 decltype(qual_copier_queue)::Block records_out;
1252 seq_copier_queue.read(records_in);
1253 for (
size_t i = 0; i < records_in.count; i++) {
// header and seq are moved through unchanged; only qual is copied here.
1254 records_out.data[i].header = std::move(records_in.data[i].header);
1255 records_out.data[i].seq = std::move(records_in.data[i].seq);
1256 records_out.data[i].qual =
1257 std::string(records_in.data[i].qual.s, records_in.data[i].qual.size);
1258 auto& qual = records_out.data[i].qual;
// Special handling when the copied quality string still ends with '\n'.
1259 if (!qual.empty() && qual.back() ==
'\n') {
// Carry the block bookkeeping over to the output block.
1263 records_out.count = records_in.count;
1264 records_out.current = records_in.current;
1265 records_out.index = records_in.index;
// A block with no records is also written to the output queue.
1266 if (records_out.count == 0) {
1267 qual_copier_queue.write(records_out);
1270 qual_copier_queue.write(records_out);
// Spawns the postprocessor thread: it takes blocks from qual_copier_queue,
// splits each header into name and comment, optionally trims lowercase ends
// and folds case, numbers the records and writes the block to
// postprocessor_queue.
1276 SeqReader::start_postprocessor()
1278 postprocessor_thread =
new std::thread([
this]() {
1279 decltype(qual_copier_queue)::Block records_in;
1280 decltype(postprocessor_queue)::Block records_out;
1282 qual_copier_queue.read(records_in);
1283 for (
size_t i = 0; i < records_in.count; i++) {
// The header is split at its first space: name before, comment after.
1284 char* space = std::strstr(records_in.data[i].header,
" ");
// For FASTA and FASTQ the first header character is not part of the name.
1285 size_t name_start = (format == FASTA || format == FASTQ) ? 1 : 0;
// No space: the whole header (after name_start) is the name.
1286 if (space ==
nullptr) {
1287 records_out.data[i].name =
1288 std::string(records_in.data[i].header.s + name_start,
1289 records_in.data[i].header.size - name_start);
1290 records_out.data[i].comment =
"";
// Space found: name is the text before it, comment the text after it.
1292 records_out.data[i].name =
1293 std::string(records_in.data[i].header.s + name_start,
1294 space - records_in.data[i].header.s - name_start);
1295 records_out.data[i].comment =
1296 std::string(space + 1,
1297 records_in.data[i].header.size -
1298 (space - records_in.data[i].header.s) - 1);
1300 records_in.data[i].header.clear();
1301 records_out.data[i].seq = std::move(records_in.data[i].seq);
1302 records_out.data[i].qual = std::move(records_in.data[i].qual);
1303 auto& name = records_out.data[i].name;
1304 auto& comment = records_out.data[i].comment;
1305 auto& seq = records_out.data[i].seq;
1306 auto& qual = records_out.data[i].qual;
// Special handling for a name or comment that still ends with '\n'.
1307 if (!name.empty() && name.back() ==
'\n') {
1310 if (!comment.empty() && comment.back() ==
'\n') {
// Optional trimming of lowercase runs at both ends of the sequence; the
// same positions are removed from the quality string when one is present.
1313 if (flagTrimMasked()) {
1314 const auto len = seq.length();
1315 size_t trim_start = 0, trim_end = seq.length();
// NOTE(review): `trim_start <= len` allows seq[len] (the terminator) to be
// inspected; `<` looks like the intended bound -- confirm.
// NOTE(review): islower() receives a plain char; for negative char values
// that is undefined behaviour -- consider casting to unsigned char.
1316 while (trim_start <= len &&
bool(islower(seq[trim_start]))) {
1319 while (trim_end > 0 &&
bool(islower(seq[trim_end - 1]))) {
// The tail is erased first so that trim_start still indexes correctly.
1322 seq.erase(trim_end);
1323 seq.erase(0, trim_start);
// NOTE(review): qual.erase(trim_end) throws std::out_of_range when qual is
// shorter than trim_end -- presumably qual and seq have equal length; verify.
1324 if (!qual.empty()) {
1325 qual.erase(trim_end);
1326 qual.erase(0, trim_start);
// Optional case folding through the CAPITALS lookup table.
1329 if (flagFoldCase()) {
1330 for (
auto& c : seq) {
// NOTE(review): unsigned(c) turns a negative char into a very large index;
// confirm CAPITALS cannot be indexed out of range for bytes >= 0x80.
1332 c = CAPITALS[unsigned(c)];
1334 log_error(std::string(
"A sequence contains invalid "
1335 "IUPAC character: ") +
1337 std::exit(EXIT_FAILURE);
// Record number: block index times block size plus position in the block.
1341 records_out.data[i].num = records_in.index * RECORD_BLOCK_SIZE + i;
// Carry the block bookkeeping over to the output block.
1343 records_out.count = records_in.count;
1344 records_out.current = records_in.current;
1345 records_out.index = records_in.index;
// A block with no records is also written to the output queue.
1346 if (records_out.count == 0) {
1347 postprocessor_queue.write(records_out);
1350 postprocessor_queue.write(records_out);
// Hands out the next processed record. Records are served from a
// thread_local block selected by this reader's id and refilled from
// postprocessor_queue when exhausted.
1355 inline SeqReader::Record
1358 auto& ready_records = ready_records_array()[id];
1359 auto& ready_record = ready_record_array()[id];
// Current block fully consumed: fetch the next one.
1360 if (ready_records.count <= ready_records.current) {
1361 postprocessor_queue.read(ready_records);
// Still nothing to serve after the refill.
1362 if (ready_records.count <= ready_records.current) {
// Take the next record of the block and advance the cursor.
1367 ready_record = &(ready_records.data[ready_records.current++]);
// The record is moved out of the block slot.
1368 return std::move(*ready_record);