-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathShuffleReads.cpp
More file actions
59 lines (49 loc) · 1.19 KB
/
ShuffleReads.cpp
File metadata and controls
59 lines (49 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include "htslib/kseq.h"
#include "htslib/sam.h"
#include "htslib/hts.h"
#include <pthread.h>
#include <semaphore.h>
#include <zlib.h>
KSEQ_INIT(gzFile, gzread)
#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using namespace std;
/**
 * Shuffle the records of a sequence file.
 *
 * Usage: ShuffleReads <input> <output>
 *
 * Every record of argv[1] (opened with gzopen and parsed with kseq) is loaded
 * into memory, the record order is permuted, and each record whose sequence is
 * non-empty is written to argv[2] as two lines: ">name" followed by the
 * sequence. Only the name and sequence fields of each record are kept.
 *
 * While reading, the running totals (bases, records) are printed to stderr
 * each time more than 100,000,000 further bases have been consumed.
 *
 * Returns 0 on success and 1 on bad usage, an unopenable input or output
 * file, a reader error, or a failed write.
 */
int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "Usage: ShuffleReads <input.fasta[.gz]> <output.fasta>" << std::endl;
    return 1;
  }

  gzFile fastaFile = gzopen(argv[1], "r");
  if (!fastaFile) {
    std::cerr << "ERROR: could not open input file " << argv[1] << std::endl;
    return 1;
  }
  kseq_t *ks = kseq_init(fastaFile);
  // NOTE(review): ks and fastaFile are never released explicitly (the original
  // did not release them either); they live until the process exits. Consider
  // kseq_destroy()/gzclose() once the read loop is done.

  // Records are stored by value so the vectors own (and free) the strings.
  std::vector<std::string> reads, names;

  const long progressInterval = 100000000;  // bases between progress reports
  long space = 0;  // total bases read so far
  long iter = 0;   // bases read since the last progress report
  long status = 0;
  while ((status = kseq_read(ks)) >= 0) {  // each kseq_read() call reads one query sequence
    reads.emplace_back(ks->seq.s, ks->seq.l);
    names.emplace_back(ks->name.s, ks->name.l);
    space += ks->seq.l;
    iter += ks->seq.l;
    if (iter > progressInterval) {
      iter = 0;
      std::cerr << space << "\t" << reads.size() << std::endl;
    }
  }
  // Per kseq.h, -1 is the ordinary end-of-file code; lower values report a
  // truncated record or a stream read error. Do not emit a silently
  // incomplete shuffle in that case.
  if (status < -1) {
    std::cerr << "ERROR: failed while reading " << argv[1]
              << " (kseq_read returned " << status << ")" << std::endl;
    return 1;
  }

  // size_t indices: the record count is not limited to the range of int.
  std::vector<std::size_t> indices(reads.size());
  for (std::size_t i = 0; i < indices.size(); ++i) {
    indices[i] = i;
  }

  // std::random_shuffle was removed in C++17; std::shuffle with an explicit
  // engine replaces it. The engine is default-seeded on purpose: the original
  // never seeded its generator, so repeated runs on the same input produced
  // the same order, and that reproducibility is kept here.
  std::mt19937_64 rng;
  std::shuffle(indices.begin(), indices.end(), rng);

  // The output is opened only after the input has been fully read.
  std::ofstream out(argv[2]);
  if (!out) {
    std::cerr << "ERROR: could not open output file " << argv[2] << std::endl;
    return 1;
  }
  for (const std::size_t idx : indices) {
    if (!reads[idx].empty()) {
      // '\n' rather than endl: avoids flushing the stream after every line.
      out << '>' << names[idx] << '\n';
      out << reads[idx] << '\n';
    }
  }
  out.flush();
  if (!out) {
    std::cerr << "ERROR: failed while writing " << argv[2] << std::endl;
    return 1;
  }
  return 0;
}