Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions code/fileio.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import pandas as pd

def read_in_files(input_file, state_tree_file):
def read_in_files(input_file, state_tree_file, germline_mutations_file):
BC = pd.read_csv(input_file, index_col=0)
S = []
L = {}
mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
SNP = []
mutations = set(sorted([v.rsplit('_',1)[0] for v in BC.columns[1:]]))
BC.index = map(str, BC.index)

with open(state_tree_file) as f:
Expand All @@ -16,8 +17,12 @@ def read_in_files(input_file, state_tree_file):
L[tuple(edge)] = line_split[2:]
except:
L[tuple(edge)] = []

return BC, S, L

with open(germline_mutations_file) as f:
contents = f.readlines()
if len(contents) > 0:
SNP = contents[0].strip().split(',')
return BC, S, L, SNP

def write_out_files(B, B_with_ancestors, T, filename, totalLL):
B.to_csv('{}.B'.format(filename))
Expand Down
38 changes: 28 additions & 10 deletions code/optimize_mutation_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@ def solve_model(C):
vals = []
# Create a new model
m = Model("mip1")
# Add Solver Time Limit
m.setParam('TimeLimit', 5*60*60)

Bs = {}
# Create mutation matrix
for p in C.index:
for a in C.columns:
Bs[(p,a)] = m.addVar(vtype=GRB.BINARY, name="b_{}_{}".format(p,a))
vals.append((p,a))

for i,a in enumerate(C.columns):
for b in C.columns[i+1:]:

Expand Down Expand Up @@ -70,16 +72,34 @@ def solve_model(C):

# Set objective
objn = sum(Bs[(p,a)]*C.loc[p][a] for p,a in vals)

m.setObjective(objn, GRB.MAXIMIZE)

m.optimize()

B = C.copy()

for v in m.getVars():
if v.varName.startswith('b'):
try:
p,a = v.varName.split('_')[1:]
'''
if v.varName.startswith('b_ANC:'):
p = '_'.join(v.varName.split('_')[1:3])
a = '_'.join(v.varName.split('_')[3:])
else:
p = '_'.join(v.varName.split('_')[1:4])
a = '_'.join(v.varName.split('_')[4:])
'''
if v.varName.startswith('b_ANC:'):
filter_var = v.varName.split(':')[1]
p = '_'.join(filter_var.split('_')[0:1])
a = '_'.join(filter_var.split('_')[1:])
if p.isdigit():
p = B.index[int(p)]
else:
filter_var = v.varName.split('_',1)[1]
p = '_'.join(filter_var.split('_')[0:3])
a = '_'.join(filter_var.split('_')[3:])
#p,a = v.varName.split('_')[1:]
except:
print(v.varName)
raise
Expand Down Expand Up @@ -143,15 +163,14 @@ def desc_scores(v):
print("----------------- DESCS", descs)
except KeyError:
descs=[]

for i,D in enumerate(descs):
child, desc = D
print(child)
#print(child)
d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\
index = ['ANC:{}'.format(child)])
index = ['ANC:{}'.format(child)])
C = C.append(d)


#C = C.reset_index(drop = True)
return C

Expand Down Expand Up @@ -182,6 +201,7 @@ def get_descendent_profiles(sigmas, mutations, S, L):
#print("DPS")
#print(child, descendent_profile)


return DPs

def assemble_mutation_matrix(Bs,sigmas, BC, mutations):
Expand Down Expand Up @@ -259,8 +279,7 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
input_file = "test_data/BC.csv"
state_tree = "test_data/S.csv"
BC, S, L = read_in_files(input_file, state_tree)
mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))

mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
sigmas = get_optimal_sigma(S,BC)

DPs = get_descendent_profiles(sigmas, mutations)
Expand All @@ -269,5 +288,4 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
C = calculate_C(1, sigmas, DPs)

B = solve_model(C)

#print(B)
26 changes: 17 additions & 9 deletions code/optimize_sigma.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

from itertools import chain
from probmodels import log_prob_absent, log_prob_present, log_prob_mixed
import pandas as pd

###
# Calculates the optimal sigma assignments for all mutations given state tree S
###
def get_optimal_sigma(S, BC, L):
def get_optimal_sigma(S, BC, L, SNP):
"""
For all mutations calculates the optimal sigma assignment
S -- edgelist representation of copy-number state tree
Expand All @@ -19,11 +19,12 @@ def get_optimal_sigma(S, BC, L):
copy-number states, and entries in {'Absent', 'Present', 'Mixed'}

"""
mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
C = list(BC['c'])
num_states = len(set(C))
#num_states = len(set(C))
num_states = len(set(chain(*S)))
subtrees = enum_all_subtrees(num_states, S)

print(subtrees)
sigmas = {}
print(L)

Expand All @@ -32,15 +33,17 @@ def get_optimal_sigma(S, BC, L):
max_value = float('-inf')
max_sigma = None
max_deletions = None

V = list(BC['{}_v'.format(mutation)])
T = list(BC['{}_t'.format(mutation)])

for subtree in subtrees:

status1 = ['Mixed' if i == subtree[0] else 'Present' if i in subtree[1:] else 'Absent' for i in range(num_states)]

# S is the edge list
# For every edge, I need to check that if I go from present to absent
# then there is an allowed loss

valid_tree = True
tree_deletions = []
for edge in S:
Expand All @@ -52,18 +55,22 @@ def get_optimal_sigma(S, BC, L):
valid_tree = False
else:
tree_deletions.append(('ANC:{}'.format(t), mutation))

if not valid_tree: continue

p1 = log_prob_sigma(V,T,C, status1)

if p1 > max_value:
max_value = p1
max_sigma = status1
max_deletions = tree_deletions

deletions += max_deletions
sigmas[mutation] = max_sigma

# Converted all SNP mutation 0 state sigma value to Present to account for Germline Mutations
for mutation in SNP:
if mutation in sigmas.keys():
sigmas[mutation][0] = 'Present'

sigma_new = pd.DataFrame(sigmas)

Expand Down Expand Up @@ -96,6 +103,7 @@ def log_prob_sigma(V,T,C,sigma):
# print('-------')
v,t,c = R


status = sigma[c]
if status == 'Absent':
log_prob += log_prob_absent(v,t)
Expand Down
7 changes: 4 additions & 3 deletions code/probmodels.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import pandas as pd

PROB_SEQ_ERROR = 0.001
BETABINOM_ALPHA = 1.0
BETABINOM_BETA = 1.0
ADO_PRECISION = 15
BETABINOM_ALPHA = PROB_SEQ_ERROR * ADO_PRECISION
BETABINOM_BETA = (1 - PROB_SEQ_ERROR) * ADO_PRECISION

from scipy.stats import betabinom, binom
import math
# Calculating the optimal sigma

def log_prob_absent(v,t):
prob = binom.logpmf(v,t,PROB_SEQ_ERROR)
prob = betabinom.logpmf(v,t, BETABINOM_ALPHA, BETABINOM_BETA)
return prob

def log_prob_present(v,t):
Expand Down
21 changes: 14 additions & 7 deletions code/scarlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def correct_ternary_matrix(B, S, BC, deletions):
for deletion in deletions:
print(" ----", deletion)
child, mut = deletion
#print('DELETION: ', deletion, child, mut)
child = int(child.replace('ANC:', ''))
# calculate the set of copy-number states affected by the deletion
# not the most efficient way to do this but S should be small
Expand All @@ -31,27 +32,33 @@ def correct_ternary_matrix(B, S, BC, deletions):
def main():
BC_file = sys.argv[1]
SL_file = sys.argv[2]
output_file = sys.argv[3]

#TODO: Add input file containing list of germline mutations (SNPs)
SNP_file = sys.argv[3]

output_file = sys.argv[4]

BC, S, L = read_in_files(BC_file, SL_file)
mutations = sorted(set([v.split('_')[0] for v in BC.columns[1:]]))
sigmas, dels = get_optimal_sigma(S,BC,L)
BC, S, L, SNP = read_in_files(BC_file, SL_file, SNP_file)

mutations = sorted(set([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
sigmas, dels = get_optimal_sigma(S,BC,L,SNP)
DPs = get_descendent_profiles(sigmas, mutations, S, L)

all_deletions = dels
cn_states = BC['c'].unique()
Bs = {}

for i in cn_states:

C= calculate_C(i, sigmas, DPs, BC)

B, deletions = solve_model(C)

Bs[i]=B
all_deletions += deletions

print("All DELETIONS", all_deletions)


result = assemble_mutation_matrix(Bs, sigmas, BC, mutations)
result_with_ancestors = assemble_mutation_matrix_with_ancestors(Bs, sigmas, BC, mutations)

Expand Down