raphael-group · Bolladeen · Nov 7, 2022 · Dec 8, 2022 · Dec 13, 2022 · Feb 21, 2023
diff --git a/code/fileio.py b/code/fileio.py
@@ -1,10 +1,11 @@
 import pandas as pd
 
-def read_in_files(input_file, state_tree_file):
+def read_in_files(input_file, state_tree_file, germline_mutations_file):
     BC = pd.read_csv(input_file, index_col=0)
     S = []
     L = {}
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
+    SNP = []
+    mutations = set(sorted([v.rsplit('_',1)[0] for v in BC.columns[1:]]))
     BC.index = map(str, BC.index)
 
     with open(state_tree_file) as f:
@@ -16,8 +17,12 @@ def read_in_files(input_file, state_tree_file):
                 L[tuple(edge)] = line_split[2:]
             except:
                 L[tuple(edge)] = []
-
-    return BC, S, L
+
+    with open(germline_mutations_file) as f:
+        contents = f.readlines()
+        if len(contents) > 0:
+            SNP = contents[0].strip().split(',')
+    return BC, S, L, SNP
 
 def write_out_files(B, B_with_ancestors, T, filename, totalLL):
     B.to_csv('{}.B'.format(filename))

diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py
@@ -15,14 +15,16 @@ def solve_model(C):
         vals = []
         # Create a new model
         m = Model("mip1")
+        # Add Solver Time Limit
+        m.setParam('TimeLimit', 5*60*60)
 
         Bs = {}
         # Create mutation matrix
         for p in C.index:
             for a in C.columns:
                 Bs[(p,a)] = m.addVar(vtype=GRB.BINARY, name="b_{}_{}".format(p,a))
                 vals.append((p,a))
-
+        
         for i,a in enumerate(C.columns):
             for b in C.columns[i+1:]:
 
@@ -70,16 +72,34 @@ def solve_model(C):
 
         # Set objective
         objn = sum(Bs[(p,a)]*C.loc[p][a] for p,a in vals)
-
         m.setObjective(objn, GRB.MAXIMIZE)
 
         m.optimize()
 
         B = C.copy()
+
         for v in m.getVars():
             if v.varName.startswith('b'):
                 try:
-                    p,a = v.varName.split('_')[1:]
+                    ''' 
+                    if v.varName.startswith('b_ANC:'):                                                                                                   
+                        p = '_'.join(v.varName.split('_')[1:3])                                                                                          
+                        a = '_'.join(v.varName.split('_')[3:])                                                                                           
+                     else:                                                                                                                                
+                        p = '_'.join(v.varName.split('_')[1:4])                                                                                          
+                        a = '_'.join(v.varName.split('_')[4:])       
+                    '''
+                    if v.varName.startswith('b_ANC:'):
+                        filter_var = v.varName.split(':')[1]
+                        p = '_'.join(filter_var.split('_')[0:1])
+                        a = '_'.join(filter_var.split('_')[1:])
+                        if p.isdigit():
+                            p = B.index[int(p)]
+                    else:
+                        filter_var = v.varName.split('_',1)[1]
+                        p = '_'.join(filter_var.split('_')[0:3])
+                        a = '_'.join(filter_var.split('_')[3:]) 
+                        #p,a = v.varName.split('_')[1:]
                 except:
                     print(v.varName)
                     raise
@@ -143,15 +163,14 @@ def desc_scores(v):
         print("----------------- DESCS", descs)
     except KeyError:
         descs=[]
-
+    
     for i,D in enumerate(descs):
         child, desc = D
-        print(child)
+        #print(child)
         d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\
-            index = ['ANC:{}'.format(child)])
+                index = ['ANC:{}'.format(child)])
         C = C.append(d)
 
-
     #C = C.reset_index(drop = True)
     return C
 
@@ -182,6 +201,7 @@ def get_descendent_profiles(sigmas, mutations, S, L):
         #print("DPS")
         #print(child, descendent_profile)
 
+
     return DPs 
 
 def assemble_mutation_matrix(Bs,sigmas, BC, mutations):
@@ -259,8 +279,7 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     input_file = "test_data/BC.csv"
     state_tree = "test_data/S.csv"
     BC, S, L = read_in_files(input_file, state_tree)
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
-
+    mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
     sigmas = get_optimal_sigma(S,BC)
 
     DPs = get_descendent_profiles(sigmas, mutations)
@@ -269,5 +288,4 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     C = calculate_C(1, sigmas, DPs)
 
     B = solve_model(C)
-
     #print(B)
diff --git a/code/optimize_sigma.py b/code/optimize_sigma.py
@@ -1,11 +1,11 @@
-
+from itertools import chain
 from probmodels import log_prob_absent, log_prob_present, log_prob_mixed
 import pandas as pd
 
 ###
 #   Calculates the optimal sigma assignments for all mutations given state tree S
 ###
-def get_optimal_sigma(S, BC, L):
+def get_optimal_sigma(S, BC, L, SNP):
     """
     For all mutations calculates the optimal sigma assignment
     S -- edgelist representation of copy-number state tree
@@ -19,11 +19,12 @@ def get_optimal_sigma(S, BC, L):
     copy-number states, and entries in {'Absent', 'Present', 'Mixed'}
 
     """
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
+    mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
     C = list(BC['c'])
-    num_states = len(set(C))
+    #num_states = len(set(C))
+    num_states = len(set(chain(*S)))
     subtrees = enum_all_subtrees(num_states, S)
-
+    print(subtrees)
     sigmas = {}
     print(L)
 
@@ -32,15 +33,17 @@ def get_optimal_sigma(S, BC, L):
         max_value = float('-inf')
         max_sigma = None
         max_deletions = None
+
         V = list(BC['{}_v'.format(mutation)])
         T = list(BC['{}_t'.format(mutation)])
+
         for subtree in subtrees:
 
             status1 = ['Mixed' if i == subtree[0] else 'Present' if i in subtree[1:] else 'Absent' for i in range(num_states)]
-
             # S is the edge list
             # For every edge, I need to check that if I go from present to absent
             # then there is an allowed loss
+
             valid_tree = True
             tree_deletions = []
             for edge in S:
@@ -52,18 +55,22 @@ def get_optimal_sigma(S, BC, L):
                         valid_tree = False
                     else:
                         tree_deletions.append(('ANC:{}'.format(t), mutation))
-                    
+
             if not valid_tree: continue
 
             p1 = log_prob_sigma(V,T,C, status1)
-
             if p1 > max_value:
                 max_value = p1
                 max_sigma = status1
                 max_deletions = tree_deletions
-
+        
         deletions += max_deletions
         sigmas[mutation] = max_sigma
+
+    # Converted all SNP mutation 0 state sigma value to Present to account for Germline Mutations
+    for mutation in SNP:
+        if mutation in sigmas.keys():
+            sigmas[mutation][0] = 'Present'
 
     sigma_new = pd.DataFrame(sigmas)
 
@@ -96,6 +103,7 @@ def log_prob_sigma(V,T,C,sigma):
         # print('-------')
         v,t,c = R
 
+
         status = sigma[c]
         if status == 'Absent':
             log_prob += log_prob_absent(v,t)

diff --git a/code/probmodels.py b/code/probmodels.py
@@ -1,15 +1,16 @@
 import pandas as pd
 
 PROB_SEQ_ERROR = 0.001
-BETABINOM_ALPHA = 1.0
-BETABINOM_BETA = 1.0
+ADO_PRECISION = 15
+BETABINOM_ALPHA = PROB_SEQ_ERROR * ADO_PRECISION
+BETABINOM_BETA = (1 - PROB_SEQ_ERROR) * ADO_PRECISION
 
 from scipy.stats import betabinom, binom
 import math
 # Calculating the optimal sigma
 
 def log_prob_absent(v,t):
-    prob = binom.logpmf(v,t,PROB_SEQ_ERROR)
+    prob = betabinom.logpmf(v,t, BETABINOM_ALPHA, BETABINOM_BETA)
     return prob
 
 def log_prob_present(v,t):

diff --git a/code/scarlet.py b/code/scarlet.py
@@ -11,6 +11,7 @@ def correct_ternary_matrix(B, S, BC, deletions):
     for deletion in deletions:
         print("  ----", deletion)
         child, mut = deletion
+        #print('DELETION: ', deletion, child, mut)
         child = int(child.replace('ANC:', ''))
         # calculate the set of copy-number states affected by the deletion
         # not the most efficient way to do this but S should be small
@@ -31,27 +32,33 @@ def correct_ternary_matrix(B, S, BC, deletions):
 def main():
     BC_file = sys.argv[1]
     SL_file = sys.argv[2]
-    output_file = sys.argv[3]
+
+    #TODO: Add input file containing list of germline mutations (SNPs)
+    SNP_file = sys.argv[3]
+
+    output_file = sys.argv[4]
 
-    BC, S, L = read_in_files(BC_file, SL_file)
-    mutations = sorted(set([v.split('_')[0] for v in BC.columns[1:]]))
-    sigmas, dels = get_optimal_sigma(S,BC,L)
+    BC, S, L, SNP = read_in_files(BC_file, SL_file, SNP_file)
+
+    mutations = sorted(set([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
+    sigmas, dels = get_optimal_sigma(S,BC,L,SNP)
     DPs = get_descendent_profiles(sigmas, mutations, S, L)
 
     all_deletions = dels
     cn_states = BC['c'].unique()
     Bs = {}
+
     for i in cn_states:
-
+        
         C= calculate_C(i, sigmas, DPs, BC)
-
+        
         B, deletions = solve_model(C)
+
         Bs[i]=B
         all_deletions += deletions
 
     print("All DELETIONS", all_deletions)
 
-
     result = assemble_mutation_matrix(Bs, sigmas, BC, mutations)
     result_with_ancestors = assemble_mutation_matrix_with_ancestors(Bs, sigmas, BC, mutations)