diff --git a/code/fileio.py b/code/fileio.py index b6e4044..36fbe97 100644 --- a/code/fileio.py +++ b/code/fileio.py @@ -1,10 +1,11 @@ import pandas as pd -def read_in_files(input_file, state_tree_file): +def read_in_files(input_file, state_tree_file, germline_mutations_file): BC = pd.read_csv(input_file, index_col=0) S = [] L = {} - mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]])) + SNP = [] + mutations = set(sorted([v.rsplit('_',1)[0] for v in BC.columns[1:]])) BC.index = map(str, BC.index) with open(state_tree_file) as f: @@ -16,8 +17,12 @@ def read_in_files(input_file, state_tree_file): L[tuple(edge)] = line_split[2:] except: L[tuple(edge)] = [] - - return BC, S, L + + with open(germline_mutations_file) as f: + contents = f.readlines() + if len(contents) > 0: + SNP = contents[0].strip().split(',') + return BC, S, L, SNP def write_out_files(B, B_with_ancestors, T, filename, totalLL): B.to_csv('{}.B'.format(filename)) diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py index 7289b6f..d482738 100644 --- a/code/optimize_mutation_matrix.py +++ b/code/optimize_mutation_matrix.py @@ -15,6 +15,8 @@ def solve_model(C): vals = [] # Create a new model m = Model("mip1") + # Add Solver Time Limit + m.setParam('TimeLimit', 5*60*60) Bs = {} # Create mutation matrix @@ -22,7 +24,7 @@ def solve_model(C): for a in C.columns: Bs[(p,a)] = m.addVar(vtype=GRB.BINARY, name="b_{}_{}".format(p,a)) vals.append((p,a)) - + for i,a in enumerate(C.columns): for b in C.columns[i+1:]: @@ -70,16 +72,34 @@ def solve_model(C): # Set objective objn = sum(Bs[(p,a)]*C.loc[p][a] for p,a in vals) - m.setObjective(objn, GRB.MAXIMIZE) m.optimize() B = C.copy() + for v in m.getVars(): if v.varName.startswith('b'): try: - p,a = v.varName.split('_')[1:] + ''' + if v.varName.startswith('b_ANC:'): + p = '_'.join(v.varName.split('_')[1:3]) + a = '_'.join(v.varName.split('_')[3:]) + else: + p = '_'.join(v.varName.split('_')[1:4]) + a = '_'.join(v.varName.split('_')[4:]) + ''' + if v.varName.startswith('b_ANC:'): + filter_var = v.varName.split(':')[1] + p = '_'.join(filter_var.split('_')[0:1]) + a = '_'.join(filter_var.split('_')[1:]) + if p.isdigit(): + p = B.index[int(p)] + else: + filter_var = v.varName.split('_',1)[1] + p = '_'.join(filter_var.split('_')[0:3]) + a = '_'.join(filter_var.split('_')[3:]) + #p,a = v.varName.split('_')[1:] except: print(v.varName) raise @@ -143,15 +163,14 @@ def desc_scores(v): print("----------------- DESCS", descs) except KeyError: descs=[] - + for i,D in enumerate(descs): child, desc = D - print(child) + #print(child) d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\ - index = ['ANC:{}'.format(child)]) + index = ['ANC:{}'.format(child)]) C = C.append(d) - #C = C.reset_index(drop = True) return C @@ -182,6 +201,7 @@ def get_descendent_profiles(sigmas, mutations, S, L): #print("DPS") #print(child, descendent_profile) + return DPs def assemble_mutation_matrix(Bs,sigmas, BC, mutations): @@ -259,8 +279,7 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations): input_file = "test_data/BC.csv" state_tree = "test_data/S.csv" BC, S, L = read_in_files(input_file, state_tree) - mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]])) - + mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]])) sigmas = get_optimal_sigma(S,BC) DPs = get_descendent_profiles(sigmas, mutations) @@ -269,5 +288,4 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations): C = calculate_C(1, sigmas, DPs) B = solve_model(C) - #print(B) diff --git a/code/optimize_sigma.py b/code/optimize_sigma.py index 3728fc1..bba29a0 100644 --- a/code/optimize_sigma.py +++ b/code/optimize_sigma.py @@ -1,11 +1,11 @@ - +from itertools import chain from probmodels import log_prob_absent, log_prob_present, log_prob_mixed import pandas as pd ### # Calculates the optimal sigma assignments for all mutations given state tree S ### -def get_optimal_sigma(S, BC, L): +def get_optimal_sigma(S, BC, L, SNP): """ For all mutations calculates the optimal sigma assignment S -- edgelist representation of copy-number state tree @@ -19,11 +19,12 @@ def get_optimal_sigma(S, BC, L): copy-number states, and entries in {'Absent', 'Present', 'Mixed'} """ - mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]])) + mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]])) C = list(BC['c']) - num_states = len(set(C)) + #num_states = len(set(C)) + num_states = len(set(chain(*S))) subtrees = enum_all_subtrees(num_states, S) - + print(subtrees) sigmas = {} print(L) @@ -32,15 +33,17 @@ def get_optimal_sigma(S, BC, L): max_value = float('-inf') max_sigma = None max_deletions = None + V = list(BC['{}_v'.format(mutation)]) T = list(BC['{}_t'.format(mutation)]) + for subtree in subtrees: status1 = ['Mixed' if i == subtree[0] else 'Present' if i in subtree[1:] else 'Absent' for i in range(num_states)] - # S is the edge list # For every edge, I need to check that if I go from present to absent # then there is an allowed loss + valid_tree = True tree_deletions = [] for edge in S: @@ -52,18 +55,22 @@ def get_optimal_sigma(S, BC, L): valid_tree = False else: tree_deletions.append(('ANC:{}'.format(t), mutation)) - + if not valid_tree: continue p1 = log_prob_sigma(V,T,C, status1) - if p1 > max_value: max_value = p1 max_sigma = status1 max_deletions = tree_deletions - + deletions += max_deletions sigmas[mutation] = max_sigma + + # Converted all SNP mutation 0 state sigma value to Present to account for Germline Mutations + for mutation in SNP: + if mutation in sigmas.keys(): + sigmas[mutation][0] = 'Present' sigma_new = pd.DataFrame(sigmas) @@ -96,6 +103,7 @@ def log_prob_sigma(V,T,C,sigma): # print('-------') v,t,c = R + status = sigma[c] if status == 'Absent': log_prob += log_prob_absent(v,t) diff --git a/code/probmodels.py b/code/probmodels.py index e561cf6..4679058 100644 --- a/code/probmodels.py +++ b/code/probmodels.py @@ -1,15 +1,16 @@ import pandas as pd PROB_SEQ_ERROR = 0.001 -BETABINOM_ALPHA = 1.0 -BETABINOM_BETA = 1.0 +ADO_PRECISION = 15 +BETABINOM_ALPHA = PROB_SEQ_ERROR * ADO_PRECISION +BETABINOM_BETA = (1 - PROB_SEQ_ERROR) * ADO_PRECISION from scipy.stats import betabinom, binom import math # Calculating the optimal sigma def log_prob_absent(v,t): - prob = binom.logpmf(v,t,PROB_SEQ_ERROR) + prob = betabinom.logpmf(v,t, BETABINOM_ALPHA, BETABINOM_BETA) return prob def log_prob_present(v,t): diff --git a/code/scarlet.py b/code/scarlet.py index 4ef69c9..b0971c9 100644 --- a/code/scarlet.py +++ b/code/scarlet.py @@ -11,6 +11,7 @@ def correct_ternary_matrix(B, S, BC, deletions): for deletion in deletions: print(" ----", deletion) child, mut = deletion + #print('DELETION: ', deletion, child, mut) child = int(child.replace('ANC:', '')) # calculate the set of copy-number states affected by the deletion # not the most efficient way to do this but S should be small @@ -31,27 +32,33 @@ def correct_ternary_matrix(B, S, BC, deletions): def main(): BC_file = sys.argv[1] SL_file = sys.argv[2] - output_file = sys.argv[3] + + #TODO: Add input file containing list of germline mutations (SNPs) + SNP_file = sys.argv[3] + + output_file = sys.argv[4] - BC, S, L = read_in_files(BC_file, SL_file) - mutations = sorted(set([v.split('_')[0] for v in BC.columns[1:]])) - sigmas, dels = get_optimal_sigma(S,BC,L) + BC, S, L, SNP = read_in_files(BC_file, SL_file, SNP_file) + + mutations = sorted(set([v.rsplit('_', 1)[0] for v in BC.columns[1:]])) + sigmas, dels = get_optimal_sigma(S,BC,L,SNP) DPs = get_descendent_profiles(sigmas, mutations, S, L) all_deletions = dels cn_states = BC['c'].unique() Bs = {} + for i in cn_states: - + C= calculate_C(i, sigmas, DPs, BC) - + B, deletions = solve_model(C) + Bs[i]=B all_deletions += deletions print("All DELETIONS", all_deletions) - result = assemble_mutation_matrix(Bs, sigmas, BC, mutations) result_with_ancestors = assemble_mutation_matrix_with_ancestors(Bs, sigmas, BC, mutations)