-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcigar_parse.py
More file actions
93 lines (80 loc) · 3.58 KB
/
cigar_parse.py
File metadata and controls
93 lines (80 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
#author: yangqi
#contact: yangqi2@grandomics.com
"""
cigar string is used to represent sequence alignment. It is compact.
cigar is used in sam format file and other files.
For more information about cigar or sam files see: https://samtools.github.io/hts-specs
Op BAM Description Consumes_query Consumes_reference
M 0 alignment match (can be a sequence match or mismatch) yes yes
I 1 insertion to the reference yes no
D 2 deletion from the reference no yes
N 3 skipped region from the reference no yes
S 4 soft clipping (clipped sequences present in SEQ) yes no
H 5 hard clipping (clipped sequences NOT present in SEQ) no no
P 6 padding (silent deletion from padded reference) no no
= 7 sequence match yes yes
X 8 sequence mismatch yes yes
aligned_len
ref ATTTGC-CGA
query ATT-GCCCGA
"""
class Cigar(object):
def __init__(self, cigar_string):
self.CIGAR = cigar_string
#consums mean step right ->
self.query_consumes = ["M","I","S","=","X"]
self.ref_consumes = ["M","D","N","=","X"]
num = ""
op = ""
self.cigar_ops = []
for c in self.CIGAR:
if c.isdigit():
num += c
else:
op = c
self.cigar_ops.append((int(num), op))
num = ""
op = ""
self.query_len = self._query_len()
self.query_mapped_len = self._query_mapped_len()
self.ref_len = self._ref_len()
self.aligned_len = self._aligned_len()
self.query_start = self._query_start()
self.query_end = self._query_end()
self.reversed_cigar = self._cigar_reverse()
def _query_len(self):
#query_len = num of hard-clip bases + num of query_consumes bases
query_len_operations = self.query_consumes + ["H"]
return sum([co[0] for co in self.cigar_ops \
if co[1] in query_len_operations])
def _query_mapped_len(self):
query_mapped_len_operations = ["M","I","=","X"]
return sum([co[0] for co in self.cigar_ops \
if co[1] in query_mapped_len_operations])
def _ref_len(self):
return sum([co[0] for co in self.cigar_ops \
if co[1] in self.ref_consumes])
def _aligned_len(self):
#query_mapped_len = except "H"s and "S"s
not_aligned_len_operations = ["H", "S"]
return sum([co[0] for co in self.cigar_ops \
if co[1] not in not_aligned_len_operations])
def _query_start(self):
#one based start
#"H" and "S" appear togather is not considered, eg "10H5S89M"
#suit for "10H89M" or "10S89M"
if self.cigar_ops[0][1] == "H" or self.cigar_ops[0][1] == "S":
return 1+self.cigar_ops[0][0]
else:
return 1
def _query_end(self):
#one based end
#"H" and "S" appear togather is not considered, eg "89M5S10H"
#suit for "89M10H" or "89M10S"
if self.cigar_ops[-1][1] == "H" or self.cigar_ops[-1][1] == "S":
return self._query_len()-self.cigar_ops[-1][0]
else:
return self._query_len()
def _cigar_reverse(self):
return "".join(["".join([str(co[0]),co[1]]) for co in self.cigar_ops][::-1])