From d785523d6b69ffe978df29fc0ea5011b18772be2 Mon Sep 17 00:00:00 2001
From: Bolladeen <aj7381@princeton.edu>
Date: Mon, 7 Nov 2022 12:26:46 -0500
Subject: [PATCH 1/4] mod(scarlet): converted codebase python2 -> python3,
 removed hardcoded betabinomial pmf

---
 code/betabinom.py                | 1092 ------------------------------
 code/fileio.py                   |    2 +-
 code/optimize_mutation_matrix.py |   46 +-
 code/optimize_sigma.py           |   30 +-
 code/probmodels.py               |   12 +-
 code/scarlet.py                  |   11 +-
 6 files changed, 51 insertions(+), 1142 deletions(-)
 delete mode 100644 code/betabinom.py

diff --git a/code/betabinom.py b/code/betabinom.py
deleted file mode 100644
index 5446ac0..0000000
--- a/code/betabinom.py
+++ /dev/null
@@ -1,1092 +0,0 @@
-# Copyright Ian Smith 2002-2018
-# Above only included as I am told that without it, others may claim copyright and stop me from editting my own copies of my own code! No idea how true this is.
-# Beyond that, I am happy for anyone to make any edits they like to this code.
-# For example, raising ValueError in pmf, cdf and sf functions should perhaps only be done when the distribution objects are created
-
-# This code is basically translated from the VBA code available at https://iandjmsmith.wordpress.com
-
-import math
-import time
-
-NonIntegralValuesAllowed_Others = False
-
-OneOverSqrTwoPi = 0.39894228040143267793994605993438
-cfSmall = 0.00000000000001
-cfVSmall = 0.000000000000001
-scalefactor = 1.1579208923731619542357098500869e+77  # 2**256  ' used for rescaling calcs w/o impacting accuracy, to avoid over/underflow
-scalefactor2 = 8.6361685550944446253863518628004e-78 # 2**-256
-max_discrete = 9007199254740991.0
-minLog1Value = -0.79149064
-
-HalfMinusEulers_const = -0.07721566490153286060651209011
-Onep25Minusln2Minuseulers_const = -0.020362845461478170023744211558177
-coeffs0Minusp25 = 0.07246703342411321823620758332301
-
-# For logfbit functions                      # Stieltjes' continued fraction
-cf_0 = 1.0 / 12.0
-cf_1 = 1.0 / 30.0
-cf_2 = 53.0 / 210.0
-cf_3 = 195.0 / 371.0
-cf_4 = 22999.0 / 22737.0
-cf_5 = 29944523.0 / 19733142.0
-cf_6 = 109535241009.0 / 48264275462.0
-cf_7 = 3.0099173832593981700731407342077
-cf_8 = 4.026887192343901226168879531814
-cf_9 = 5.0027680807540300516885024122767
-cf_10 = 6.2839113708157821800726631549524
-cf_11 = 7.4959191223840339297523547082674
-cf_12 = 9.0406602343677266995311393604326
-cf_13 = 10.489303654509482277188371304593
-cf_14 = 12.297193610386205863989437140092
-cf_15 = 13.982876953992430188259760651279
-cf_16 = 16.053551416704935469715616365007
-cf_17 = 17.976607399870277592569472307671
-cf_18 = 20.309762027441653743805414720495
-cf_19 = 22.470471639933132495517941571508
-cf_20 = 25.065846548945972029163400322506
-cf_21 = 27.464451825029133609175558982646
-cf_22 = 30.321821231673047126882599306406
-cf_23 = 32.958533929972987219994066451412
-cf_24 = 36.077698931299242645153320900855
-cf_25 = 38.952706682311555734544390410481
-cf_26 = 42.333490043576957211381853948856
-cf_27 = 45.446960850061621014424175737541
-cf_28 = 49.089203129012597708164883350275
-cf_29 = 52.441288751415337312569856046996
-cf_30 = 56.344845345341843538420365947476
-cf_31 = 59.935683907165858207852583492752
-cf_32 = 64.100422755920354527906611892238
-cf_33 = 67.930140788018221186367702745199
-cf_34 = 72.355940555211701969680052963236
-cf_35 = 76.424654626829689752585090422288
-cf_36 = 81.111403237247965484814230985683
-cf_37 = 85.419221276410972614585638717349
-cf_38 = 90.366814723864108595513574581683
-cf_39 = 94.913837100009887953076231291987
-cf_40 = 100.12217846392919748899074683447
-
-lfbArray = [cf_0, cf_1, cf_2, cf_3, cf_4, cf_5, cf_6, cf_7, cf_8, cf_9, cf_10, cf_11, cf_12, cf_13, cf_14, cf_15, cf_16, cf_17, cf_18, cf_19, cf_20, cf_21, cf_22, cf_23, cf_24, cf_25, cf_26, cf_27, cf_28, cf_29]
-coeffs=[0.32246703342411321824,6.7352301053198095133e-02,2.0580808427784547879e-02,7.3855510286739852663e-03,2.8905103307415232858e-03,1.1927539117032609771e-03,5.0966952474304242234e-04,2.2315475845357937976e-04,\
-       9.9457512781808533715e-05,4.4926236738133141700e-05,2.0507212775670691553e-05,9.4394882752683959040e-06,4.3748667899074878042e-06,2.0392157538013662368e-06,9.5514121304074198329e-07,4.4924691987645660433e-07,\
-       2.1207184805554665869e-07,1.0043224823968099609e-07,4.7698101693639805658e-08,2.2711094608943164910e-08,1.0838659214896954091e-08,5.1834750419700466551e-09,2.4836745438024783172e-09,1.1921401405860912074e-09,\
-       5.7313672416788620133e-10,2.7595228851242331452e-10,1.3304764374244489481e-10,6.4229645638381000221e-11,3.1044247747322272762e-11,1.5021384080754142171e-11,7.2759744802390796625e-12,3.5277424765759150836e-12,\
-       1.7119917905596179086e-12,8.3153858414202848198e-13,4.0422005252894400655e-13,1.9664756310966164904e-13,9.5736303878385557638e-14,4.6640760264283742246e-14,2.2737369600659723206e-14,1.1091399470834522017e-14,\
-       5.4136591567253631315e-15,2.6438800178609949985e-15,1.2918959062789967293811764562316e-15,6.3159355041984485676779394847024e-16,1.421085482803160676983430580383e-14]
-coeffs2=[9.96703342411321823620758332301e-3,4.85230105319809513323333333333e-3,2.35164176111788121233425746862e-3,1.1355510286739852662730972914e-3,5.4676033074152328575298829848e-4,2.6269438789373716759012616902e-4,\
-        1.2601997117161385090708338501e-4,6.03943417869127130948e-5,2.89279988929196448257e-5,1.38537935563149598820e-5,6.63558635521614609862e-6,3.17947224962737026300e-6,1.52432377823166362836e-6,\
-        7.31319548444223379639e-7,3.51147479316783649952e-7,1.68754473874618369035e-7,8.11753732546888155551e-8,3.90847775936649142159e-8,1.88369052760822398681e-8,9.08717580313959348175e-9,4.38794008336117216467e-9,\
-        2.12078578473653627963e-9,1.02595225309999020577e-9,4.96752618206533915776e-10,2.40726205228207715756e-10,1.16751848407213311847e-10,5.66693373586358102002e-11,2.75272781658498271913e-11,\
-        1.33812334011666457419e-11,6.50929603319331702812e-12,3.16857905287746826547e-12,1.54339039998043529180e-12,7.52239805801019839357e-13,3.66855576849641617566e-13,1.79011840661361776213e-13,\
-        8.73989502840846835258e-14,4.26932273880725309600e-14,2.08656067727432658676e-14,1.02026669800712891581e-14,4.99113012967463749398e-15,2.44274876330334144841e-15,1.19604099925791156327e-15,\
-        5.85859784064943690566e-16,2.87087981566462948467e-16,1.40735520163755175636e-16]
-
-logfbit0p5 = 0.054814121051917653896138702348386     # logfbit(0.5)
-lfb_0 = 8.1061466795327258219670263594382e-02        # logfbit(0.0)
-lfb_1 = 4.1340695955409294093822081407118e-02        # logfbit(1.0)
-lfb_2 = 2.7677925684998339148789292746245e-02        # logfbit(2.0)
-lfb_3 = 2.0790672103765093111522771767849e-02        # logfbit(3.0)
-lfb_4 = 1.6644691189821192163194865373593e-02        # logfbit(4.0)
-lfb_5 = 1.3876128823070747998745727023763e-02        # logfbit(5.0)
-
-#For logfbit functions                   #Stirling's series for ln(Gamma(x)), A046968/A046969
-lfbc1 = 1.0 / 12.0
-lfbc2 = 1.0 / 30.0                       #lfbc2 on are Sloane's ratio times 12
-lfbc3 = 1.0 / 105.0
-lfbc4 = 1.0 / 140.0
-lfbc5 = 1.0 / 99.0
-lfbc6 = 691.0 / 30030.0
-lfbc7 = 1.0 / 13.0
-lfbc8 = .35068485511628418514 #Chosen to make logfbit(6) & logfbit(7) correct
-lfbc9 = 1.6769380337122674863 #Chosen to make logfbit(6) & logfbit(7) correct
-
-# Global variables used by functions hypergeometric & CBNB0
-ha1,hprob,hswap = 0.0,0.0,False
-
-def AlterForIntegralChecks_Others(value):
-# If non integral values are allowed then returns math.floor(value) otherwise only allows value if it equals math.floor(value)
-   if NonIntegralValuesAllowed_Others:
-      return math.floor(value)
-   elif value != math.floor(value):
-      raise ValueError
-   else:
-      return value
-
-def Generalabminuscd(a,b,c,d):
-     if ((a == 0.0) or (b == 0.0) or (c == 0.0) or (d == 0.0)):
-        return a*b-c*d
-     amr, ae = math.frexp(a)
-     bmr, be = math.frexp(b)
-     cmr, ce = math.frexp(c)
-     dmr, de = math.frexp(d)
-     am = int(amr * (2**53))
-     bm = int(bmr * (2**53))
-     cm = int(cmr * (2**53))
-     dm = int(dmr * (2**53))
-     abe = ae + be
-     cde = ce + de
-     abm = am * bm
-     cdm = cm * dm
-     scale = abe - cde
-     if scale >= 0:
-          if scale > 106:
-               scale = 0
-               cdm = 1
-          #return math.ldexp(abm * (2**scale) - cdm, abe - scale - 106)
-          return float(abm * (2**scale) - cdm) * (2.0 ** (abe - scale - 106))
-     else:
-          scale = - scale
-          if scale > 106:
-               scale = 0
-               abm = 1
-          #return math.ldexp(abm - cdm * (2**scale), cde - scale - 106)
-          return float(abm - cdm * (2**scale)) * (2.0 ** (cde - scale - 106))
-          
-def logfbit2dif(x):
-# Calculation of logfbit2(x)-logfbit2(1+x).
-  return 0.5 * (((x + 1.0) * (x + 2.0)) ** -2)
-
-def logfbit2(x):
-# Second derivative of error part of Stirling's formula where log(x!) = log(sqrt(twopi))+(x+0.5)*log(x+1)-(x+1)+logfbit(x).
-  if (x >= 10000000000.0):
-     return 2.0 * lfbc1 * ((x + 1.0) ** -3)
-  elif (x >= 7.0):
-     x1 = x + 1.0
-     x2 = 1.0 / (x1 * x1)
-     x3 = x2 * (240.0 * lfbc8 - x2 * 306.0 * lfbc9)
-     x3 = x2 * (132.0 * lfbc6 - x2 * (182.0 * lfbc7 - x3))
-     x3 = x2 * (56.0 * lfbc4 - x2 * (90.0 * lfbc5 - x3))
-     x3 = x2 * (12.0 * lfbc2 - x2 * (30.0 * lfbc3 - x3))
-     return lfbc1 * (2.0 - x3) * x2 / x1
-  elif (x > -1.0):
-     x1 = x
-     x2 = 0.0
-     while (x1 < 7.0):
-        x2 = x2 + logfbit2dif(x1)
-        x1 = x1 + 1.0
-     return x2 + logfbit2(x1)
-  else:
-     raise ValueError
-
-def logfbit4dif(x):
-# Calculation of logfbit4(x)-logfbit4(1+x).
-  return (10.0 * x * (x + 3.0) + 23.0) * (((x + 1.0) * (x + 2.0)) ** -4)
-
-def logfbit4(x):
-# Fourth derivative of error part of Stirling's formula where log(x!) = log(sqrt(twopi))+(x+0.5)*log(x+1)-(x+1)+logfbit(x).
-  if (x >= 10000000000.0):
-     return -0.5 * ((x + 1.0) ** -4)
-  elif (x >= 7.0):
-     x1 = x + 1.0
-     x2 = 1.0 / (x1 * x1)
-     x3 = x2 * (73440.0 * lfbc8 - x2 * 116280.0 * lfbc9)
-     x3 = x2 * (24024.0 * lfbc6 - x2 * (43680.0 * lfbc7 - x3))
-     x3 = x2 * (5040.0 * lfbc4 - x2 * (11880.0 * lfbc5 - x3))
-     x3 = x2 * (360.0 * lfbc2 - x2 * (1680.0 * lfbc3 - x3))
-     return lfbc1 * (24.0 - x3) * x2 * x2 / x1
-  elif (x > -1.0):
-     x1 = x
-     x2 = 0.0
-     while (x1 < 7.0):
-        x2 = x2 + logfbit4dif(x1)
-        x1 = x1 + 1.0
-
-     return x2 + logfbit4(x1)
-  else:
-     raise ValueError
-
-def log0(x):
-# Accurate and quicker calculation of log(1+x), particularly for small x. Code from Wolfgang Ehrhardt.
-   if (x > 4.0):
-      return math.log(1.0 + x)
-   else:
-      y = 1.0 + x
-      if (y == 1.0):
-         return x
-      else:
-         return math.log(y) + (x - (y - 1.0)) / y
-
-def logcf(x, i, d):
-# Continued fraction for calculation of 1/i + x/(i+d) + x*x/(i+2*d) + x*x*x/(i+3d) + ...
-     c1 = 2.0 * d
-     c2 = i + d
-     c4 = c2 + d
-     a1 = c2
-     b1 = i * (c2 - i * x)
-     b2 = d * d * x
-     a2 = c4 * c2 - b2
-     b2 = c4 * b1 - i * b2
-
-     while ((abs(a2 * b1 - a1 * b2) > abs(cfVSmall * b1 * a2))):
-
-       c3 = c2 * c2 * x
-       c2 = c2 + d
-       c4 = c4 + d
-       a1 = c4 * a2 - c3 * a1
-       b1 = c4 * b2 - c3 * b1
-
-       c3 = c1 * c1 * x
-       c1 = c1 + d
-       c4 = c4 + d
-       a2 = c4 * a1 - c3 * a2
-       b2 = c4 * b1 - c3 * b2
-       if (b2 > scalefactor):
-         a1 = a1 * scalefactor2
-         b1 = b1 * scalefactor2
-         a2 = a2 * scalefactor2
-         b2 = b2 * scalefactor2
-       elif (b2 < scalefactor2):
-         a1 = a1 * scalefactor
-         b1 = b1 * scalefactor
-         a2 = a2 * scalefactor
-         b2 = b2 * scalefactor
-     return a2 / b2
-
-def log1(x):
-# Accurate calculation of log(1+x)-x, particularly for small x.
-   if (abs(x) < 0.01):
-      term = x / (2.0 + x)
-      y = term * term
-      return term * ((((2.0 / 9.0 * y + 2.0 / 7.0) * y + 0.4) * y + 2.0 / 3.0) * y - x)
-   elif (x < minLog1Value or x > 1.0):
-      return math.log(1.0 + x) - x
-   else:
-      term = x / (2.0 + x)
-      y = term * term
-      return term * (2.0 * y * logcf(y, 3.0, 2.0) - x)
-
-def logfbitdif(x):
-# Calculation of logfbit(x)-logfbit(1+x). x must be > -1.
-  if x < -0.65:
-     return (x + 1.5) * log0(1.0 / (x + 1.0)) - 1.0
-  else:
-     y2 = (2.0 * x + 3.0) ** -2
-     return y2 * logcf(y2, 3.0, 2.0)
-
-def logfbita(x):
-# Error part of Stirling's formula where math.log(x!) = math.log(math.sqrtt(twopi))+(x+0.5)*math.log(x+1)-(x+1)+logfbita(x).
-# Are we ever concerned about the relative error involved in this function? I don't think so.
-  if (x >= 100000000.0):
-     return lfbc1 / (x + 1.0)
-  elif (x >= 6.0):                      # Abramowitz & Stegun's series 6.1.41
-     x1 = x + 1.0
-     x2 = 1.0 / (x1 * x1)
-     x3 = x2 * (lfbc6 - x2 * (lfbc7 - x2 * (lfbc8 - x2 * lfbc9)))
-     x3 = x2 * (lfbc4 - x2 * (lfbc5 - x3))
-     x3 = x2 * (lfbc2 - x2 * (lfbc3 - x3))
-     return lfbc1 * (1.0 - x3) / x1
-  elif (x == 0.0):
-     return lfb_0
-  elif (x == 1.0):
-     return lfb_1
-  elif (x == 2.0):
-     return lfb_2
-  elif (x == 3.0):
-     return lfb_3
-  elif (x == 4.0):
-     return lfb_4
-  elif (x == 5.0):
-     return lfb_5
-  elif (x > -1.0):
-     x1 = x
-     x2 = 0.0
-     while (x1 < 6.0):
-        x2 = x2 + logfbitdif(x1)
-        x1 = x1 + 1.0
-     return x2 + logfbit(x1)
-  else:
-     raise ValueError
-
-def logfbit(x):
-# Calculates log of x factorial - log(sqrt(2*pi)) +(x+1) -(x+0.5)*log(x+1)
-# using the error part of Stirling's formula (see Abramowitz & Stegun# s series 6.1.41)
-# and Stieltjes' continued fraction for the gamma function.
-# For x < 1.5, uses expansion of log(x!) and log((x+1)!) from Abramowitz & Stegun# s series 6.1.33
-# We are primarily concerned about the absolute error in this function.
-# Due to cancellation errors in calculating 1+x as x tends to -1, the function loses accuracy and should not be used!
-  if (x >= 6.0):
-     x1 = x + 1.0
-     if (x >= 1000.0):
-        if (x >= 100000000.0):
-           x3 = 0.0
-        else:
-           x2 = 1.0 / (x1 * x1)
-           x3 = x2 * (lfbc2 - x2 * lfbc3)
-
-     else:
-        x2 = 1.0 / (x1 * x1)
-        if x >= 40.0:
-           x3 = 0.0
-        elif x >= 15.0:
-           x3 = x2 * (lfbc6 - x2 * lfbc7)
-        else:
-           x3 = x2 * (lfbc6 - x2 * (lfbc7 - x2 * (lfbc8 - x2 * lfbc9)))
-
-        x3 = x2 * (lfbc4 - x2 * (lfbc5 - x3))
-        x3 = x2 * (lfbc2 - x2 * (lfbc3 - x3))
-
-     return lfbc1 * (1.0 - x3) / x1
-     # return (1.0 - x3) / (12.0 * x1)
-  elif (x == 0.0):
-     return lfb_0
-  elif (x == 1.0):
-     return lfb_1
-  elif (x == 2.0):
-     return lfb_2
-  elif (x == 3.0):
-     return lfb_3
-  elif (x == 4.0):
-     return lfb_4
-  elif (x == 5.0):
-     return lfb_5
-  elif x > 1.5:
-     x1 = x + 1.0
-     if x >= 2.5:
-        # x2 = 0.25 * ((math.sqrt(x1 * x1 + 81.0) - x1) + 81.0 / (x1 + math.sqrt(x1 * x1 + 90.25)))
-        x2 = 40.5 / (x1 + math.sqrt(x1 * x1 + 81.0))
-     else:
-        # x2 = 0.25 * ((math.sqrt(x1 * x1 + 225.0) - x1) + 225.0 / (x1 + math.sqrt(x1 * x1 + 240.25)))
-        x2 = 112.5 / (x1 + math.sqrt(x1 * x1 + 225.0))
-        x2 = cf_27 / (x1 + cf_28 / (x1 + cf_29 / (x1 + x2)))
-        x2 = cf_24 / (x1 + cf_25 / (x1 + cf_26 / (x1 + x2)))
-        x2 = cf_21 / (x1 + cf_22 / (x1 + cf_23 / (x1 + x2)))
-        x2 = cf_18 / (x1 + cf_19 / (x1 + cf_20 / (x1 + x2)))
-
-     x2 = cf_15 / (x1 + cf_16 / (x1 + cf_17 / (x1 + x2)))
-     x2 = cf_12 / (x1 + cf_13 / (x1 + cf_14 / (x1 + x2)))
-     x2 = cf_9 / (x1 + cf_10 / (x1 + cf_11 / (x1 + x2)))
-     x2 = cf_6 / (x1 + cf_7 / (x1 + cf_8 / (x1 + x2)))
-     x2 = cf_3 / (x1 + cf_4 / (x1 + cf_5 / (x1 + x2)))
-     # return cf_0 / (x1 + cf_1 / (x1 + cf_2 / (x1 + x2)))
-     return 1.0 / (12.0 * (x1 + cf_1 / (x1 + cf_2 / (x1 + x2))))
-  # elif (x == 1.5):
-  #    return 3.316287351993628748511050974106e-02   #  3.316287351993628748511050974106e-02
-  elif (x == 0.5):
-     return logfbit0p5                             #  5.481412105191765389613870234839e-02
-  elif (x == -0.5):
-     return 0.15342640972002734529138393927091     #  0.15342640972002734529138393927091
-  elif x >= -0.65:
-    if x <= 0.0:
-       i = len(coeffs)-1
-       lgam = coeffs[i] * logcf(-x / 2.0, i + 2.0, 1.0)
-       #print(i, coeffs[i],lgam)
-       for i in range(i-1,0,-1):
-          lgam = (coeffs[i] - x * lgam)
-          #print(i,lgam)
-       return ((coeffs0Minusp25 - (x * (1.0 / 3.0 - (x + 1.5) * logcf(-x, 4.0, 1.0)) + lgam) * x) * x + HalfMinusEulers_const) * x + lfb_0
-    elif x <= 1.56:
-       x = x - 1.0
-       i = len(coeffs2) + 2
-       lgam = ((x + 2.5) * logcf(-x / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * (2.0 ** -i) + (3.0 ** -i) * logcf(-x / 3.0, i, 1.0)
-       #print(i,coeffs2[i-3],lgam)
-       for i in range(i-3,-1,-1):
-          lgam = (coeffs2[i] - x * lgam)
-          #print(i,lgam)
-       return (x * lgam + Onep25Minusln2Minuseulers_const) * x + lfb_1
-    elif x <= 2.5:
-       x = x - 2.0
-       i = len(coeffs)-1
-       lgam = coeffs[i] * logcf(-x / 2.0, i + 2.0, 1.0)
-       #print(i, coeffs[i],lgam)
-       for i in range(i-1,0,-1):
-          lgam = (coeffs[i] - x * lgam)
-          #print(lgam)
-       return ((x * x * (coeffs0Minus1Third - x * lgam) - (x + 2.5) * log1(x / 3.0) + log1(x / 2.0)) + FiveOver3Minusln3Minuseulers_const * x) + lfb_2
-    else:
-       x = x - 3.0
-       i = len(coeffs)-1
-       lgam = coeffs[i] * logcf(-x / 2.0, i + 2.0, 1.0)
-       #print(i, coeffs[i],lgam)
-       for i in range(i-1,0,-1):
-          lgam = (coeffs[i] - x * lgam)
-          #print(lgam)
-       return ((x * x * (coeffs0Minusp25 - x * lgam) - (x + 3.5) * log1(x / 4.0) + log1(x / 2.0) + log1(x / 3.0)) + Forty7Over48Minusln4Minuseulers_const * x) + lfb_3
-
-  elif x > -1.0:
-    return logfbitdif(x) + logfbit(x + 1.0)
-  else:
-     raise ValueError
-
-def lfbaccdif1(a , b):
-# Calculates logfbit(b)-logfbit(a+b) accurately for a > 0 & b >= 0. Reasonably accurate for a >=0 & b < 0.
-  if a < 0.0:
-     return -lfbaccdif1(-a, b + a)
-  elif (b >= 8.0):
-     y1 = b + 1.0
-     y2 = y1 ** -2
-     x1 = a + b + 1.0
-     x2 = x1 ** -2
-     x3 = x2 * lfbc9
-     y3 = y2 * lfbc9
-     acc = x2 * (a * (x1 + y1) * y3)
-     x3 = x2 * (lfbc8 - x3)
-     y3 = y2 * (lfbc8 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc7 - x3)
-     y3 = y2 * (lfbc7 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc6 - x3)
-     y3 = y2 * (lfbc6 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc5 - x3)
-     y3 = y2 * (lfbc5 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc4 - x3)
-     y3 = y2 * (lfbc4 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc3 - x3)
-     y3 = y2 * (lfbc3 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     x3 = x2 * (lfbc2 - x3)
-     y3 = y2 * (lfbc2 - y3)
-     acc = x2 * (a * (x1 + y1) * y3 - acc)
-     # return lfbc1 * (a * (1.0 - y3) - y1 * acc) / (x1 * y1)
-     return (a * (1.0 - y3) - y1 * acc) / (12.0 * x1 * y1)
-  elif b >= 1.7:
-     y1 = b + 1.0
-     x1 = a + b + 1.0
-     if b >= 3.0:
-        Start = 17
-     else:
-        Start = 29
-
-     s1 = (0.5 * (Start + 1.0)) ** 2
-     s2 = (0.5 * (Start + 1.5)) ** 2
-     ty = y1 * math.sqrt(1.0 + s1 * (y1 ** -2))
-     tx = x1 * math.sqrt(1.0 + s1 * (x1 ** -2))
-     y2 = ty - y1
-     x2 = tx - x1
-     acc = a * (1.0 - (2.0 * y1 + a) / (tx + ty))
-     # Seems to work better without the next 2 lines. - Not with modification to s2
-     ty = y1 * math.sqrt(1.0 + s2 * (y1 ** -2))
-     tx = x1 * math.sqrt(1.0 + s2 * (x1 ** -2))
-     acc = 0.25 * (acc + s1 / ((y1 + ty) * (x1 + tx)) * a * (1.0 + (2.0 * y1 + a) / (tx + ty)))
-     y2 = 0.25 * (y2 + s1 / (y1 + ty))
-     x2 = 0.25 * (x2 + s1 / (x1 + tx))
-     for i in range(Start, 0, -1):
-        acc = lfbArray[i] * (a - acc) / ((x1 + x2) * (y1 + y2))
-        y2 = lfbArray[i] / (y1 + y2)
-        x2 = lfbArray[i] / (x1 + x2)
-     return cf_0 * (a - acc) / ((x1 + x2) * (y1 + y2))
-     # return (a - acc) / (12.0 * (x1 + x2) * (y1 + y2))
-  elif b > -1.0:
-    if b < -0.66:
-       if a > 1.0:
-          return logfbitdif(b) + lfbaccdif1(a - 1.0, b + 1.0)
-       elif a == 1.0:
-          return logfbitdif(b)
-       else:
-          s2 = a * log0(1.0 / (b + 1.0 + a))
-          s1 = logfbitdif(b + a)
-          if s1 > s2:
-             s1 = (b + 1.5) * log0(a / ((b + 1.0) * (b + 2.0 + a))) - s2
-          else:
-             s2 = s1
-             s1 = (logfbitdif(b) - s1)
-
-          if s1 > 0.1 * s2:
-             return s1 + lfbaccdif1(a, b + 1.0)
-
-    if b + a > 2:
-       s1 = lfbaccdif1(b + a - 1.75, 1.75)
-       a = 1.75 - b
-    else:
-       s1 = 0.0
-
-    y1 = b - 1.0
-    x1 = y1 + a
-    i = len(coeffs2) + 2
-    scale2 = 2.0 ** -i
-    scale3 = 3.0 ** -i
-    # y2 = ((y1 + 2.5) * logcf(-y1 / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * scale2 + (scale3 * logcf(-y1 / 3.0, i, 1.0) + scale2 * scale2 * logcf(-y1 / 4.0, i, 1.0))
-    # x2 = ((x1 + 2.5) * logcf(-x1 / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * scale2 + (scale3 * logcf(-x1 / 3.0, i, 1.0) + scale2 * scale2 * logcf(-x1 / 4.0, i, 1.0))
-    y2 = ((y1 + 2.5) * logcf(-y1 / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * scale2 + scale3 * logcf(-y1 / 3.0, i, 1.0)
-    x2 = ((x1 + 2.5) * logcf(-x1 / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * scale2 + scale3 * logcf(-x1 / 3.0, i, 1.0)
-    if a > 0.000006:
-       acc = y2 - x2  # This calculation is not accurate enough for b < 0 and a small - hence if b < 0 code above and derivative code below for small a
-    else:
-       y3 = -(y1 + a / 2.0) / 2.0
-       x3 = -(y1 + a / 2.0) / 3.0
-       acc = -a * (scale2 * (logcf(y3, i, 1.0) + (y3 - 1.25) * (1.0 / (1.0 - y3) - i * logcf(y3, i + 1.0, 1.0))) - scale3 / 3.0 * ((1.0 / (1.0 - x3) - i * logcf(x3, i + 1.0, 1.0))))
-
-    for i in range(i-3, -1, -1):
-       acc = (a * y2 - x1 * acc)
-       y2 = (coeffs2[i] - y1 * y2)
-       x2 = (coeffs2[i] - x1 * x2)
-    return s1 + (y1 * y1 * acc - a * (x2 * (x1 + y1) + Onep25Minusln2Minuseulers_const))
-  else:
-    raise ValueError
-
-def hypergeometricTerm(ai, aji, aki, amkji):
-# Probability that hypergeometric variate from a population with total type Is of aki+ai, total type IIs of amkji+aji, has ai type Is and aji type IIs selected.
-# Parameterised this way for use with the Beta Negative Binomial distribution.
-   ak = aki + ai
-   amk = amkji + aji
-   aj = aji + ai
-   am = amk + ak
-   amj = amkji + aki
-   if (am > max_discrete):
-      raise ValueError
-   if ((ai == 0.0) and ((aji <= 0.0) or (aki <= 0.0) or (amj < 0.0) or (amk < 0.0))):
-      return 1.0
-   elif ((ai > 0.0) and (min(aki, aji) == 0.0) and (max(amj, amk) == 0.0)):
-      return 1.0
-   elif ((ai >= 0.0) and (amkji > -1.0) and (aki > -1.0) and (aji >= 0.0)):
-     c1 = logfbit(amkji) + logfbit(aki) + logfbit(aji) + logfbit(am) + logfbit(ai)
-     c1 = logfbit(amk) + logfbit(ak) + logfbit(aj) + logfbit(amj) - c1
-     #c1 = lfbaccdif1(ak, amk) - lfbaccdif1(ai, aki) - lfbaccdif1(ai, aji) - lfbaccdif1(aki, amkji) - logfbit(ai)
-     ai1 = ai + 1.0
-     aj1 = aj + 1.0
-     ak1 = ak + 1.0
-     am1 = am + 1.0
-     aki1 = aki + 1.0
-     aji1 = aji + 1.0
-     amk1 = amk + 1.0
-     amj1 = amj + 1.0
-     amkji1 = amkji + 1.0
-     cjkmi = Generalabminuscd(aji, aki, ai, amkji)
-     c5 = (cjkmi - ai) / (amkji1 * am1)
-     if (c5 < minLog1Value):
-        c3 = amkji * (math.log((amj1 * amk1) / (amkji1 * am1)) - c5) - c5
-     else:
-        c3 = amkji * log1(c5) - c5
-
-     c5 = (-cjkmi - aji) / (aki1 * am1)
-     if (c5 < minLog1Value):
-        c4 = aki * (math.log((ak1 * amj1) / (aki1 * am1)) - c5) - c5
-     else:
-        c4 = aki * log1(c5) - c5
-
-     c3 = c3 + c4
-     c5 = (-cjkmi - aki) / (aji1 * am1)
-     if (c5 < minLog1Value):
-        c4 = aji * (math.log((aj1 * amk1) / (aji1 * am1)) - c5) - c5
-     else:
-        c4 = aji * log1(c5) - c5
-
-     c3 = c3 + c4
-     c5 = (cjkmi - amkji) / (ai1 * am1)
-     if (c5 < minLog1Value):
-        c4 = ai * (math.log((aj1 * ak1) / (ai1 * am1)) - c5) - c5
-     else:
-        c4 = ai * log1(c5) - c5
-
-     c3 = c3 + c4
-
-     return math.exp((c1 + 1.0 / am1) + c3) * math.sqrt((amk1 * ak1) * (aj1 * amj1) / ((amkji1 * aki1 * aji1) * (am1 * ai1))) * OneOverSqrTwoPi
-   else:
-     return 0.0
-
-def hypergeometric(ai, aji, aki, amkji, comp):#, ha1, hprob, hswap):
-# Probability that hypergeometric variate from a population with total type Is of aki+ai, total type IIs of amkji+aji, has up to ai type Is selected in a sample of size aji+ai.
-# Parameterised this way for use with the Beta Negative Binomial distribution.
-     global ha1,hprob,hswap
-     if (amkji > -1.0) and (amkji < 0.0):
-        ip1 = -amkji
-        mkji = ip1 - 1.0
-        allintegral = False
-     else:
-        ip1 = amkji + 1.0
-        mkji = amkji
-        allintegral = ai == math.floor(ai) and aji == math.floor(aji) and aki == math.floor(aki) and mkji == math.floor(mkji)
-
-     if allintegral:
-        swapped = (ai + 0.5) * (mkji + 0.5) >= (aki - 0.5) * (aji - 0.5)
-     elif ai < 100.0 and ai == math.floor(ai) or mkji < 0.0:
-        if comp:
-           swapped = (ai + 0.5) * (mkji + 0.5) >= aki * aji
-        else:
-           swapped = (ai + 0.5) * (mkji + 0.5) >= aki * aji + 1000.0
-
-     elif ai < 1.0:
-        swapped = (ai + 0.5) * (mkji + 0.5) >= aki * aji
-     elif aji < 1.0 or aki < 1.0 or (ai < 1.0 and ai > 0.0):
-        swapped = False
-     else:
-        swapped = (ai + 0.5) * (mkji + 0.5) >= (aki - 0.5) * (aji - 0.5)
-
-     if not swapped:
-       i = ai
-       ji = aji
-       ki = aki
-     else:
-       i = aji - 1.0
-       ji = ai + 1.0
-       ki = ip1
-       ip1 = aki
-       mkji = aki - 1.0
-
-     c2 = ji + i
-     c4 = mkji + ki + c2
-     if (c4 > max_discrete):
-       raise ValueError
-
-     if ((i >= 0.0) and (ji <= 0.0) or (ki <= 0.0) or (ip1 + ki <= 0.0) or (ip1 + ji <= 0.0)):
-        exact = True
-        if (i >= 0.0):
-           prob = 1.0
-        else:
-           prob = 0.0
-
-     elif (ip1 > 0.0) and (ip1 < 1.0):
-        exact = False
-        prob = hypergeometricTerm(i, ji, ki, ip1) * (ip1 * (c4 + 1.0)) / ((ki + ip1) * (ji + ip1))
-     else:
-        exact = ((i == 0.0) and ((ji <= 0.0) or (ki <= 0.0) or (mkji + ki < 0.0) or (mkji + ji < 0.0))) or ((i > 0.0) and (min(ki, ji) == 0.0) and (max(mkji + ki, mkji + ji) == 0.0))
-        prob = hypergeometricTerm(i, ji, ki, mkji)
-
-     hprob = prob
-     hswap = swapped
-     ha1 = 0.0
-
-     if (exact) or (prob == 0.0):
-        if (swapped == comp):
-           return prob
-        else:
-           return 1.0 - prob
-
-     a1 = 0.0
-     sumAlways = 0.0
-     sumFactor = 10.0
-
-     if i < mkji:
-        must_do_cf = i != math.floor(i)
-        maxSums = math.floor(i)
-     else:
-        must_do_cf = mkji != math.floor(mkji)
-        maxSums = math.floor(max(mkji, 0.0))
-
-     if must_do_cf:
-        sumAlways = 0.0
-        sumFactor = 5.0
-     else:
-        sumAlways = 20.0
-        sumFactor = 10.0
-
-     if (maxSums > sumAlways or must_do_cf):
-        numb = math.floor(sumFactor / c4 * math.exp(math.log((ki + i) * (ji + i) * (ip1 + ji) * (ip1 + ki)) / 3.0))
-        numb = math.floor(i - (ki + i) * (ji + i) / c4 + numb)
-        if (numb < 0.0):
-           numb = 0.0
-        elif numb > maxSums:
-           numb = maxSums
-     else:
-        numb = maxSums
-
-     if (2.0 * numb <= maxSums or must_do_cf):
-        b1 = 1.0
-        c1 = 0.0
-        c2 = i - numb
-        c3 = mkji - numb
-        s = c3
-        a2 = c2
-        c3 = c3 - 1.0
-        b2 = Generalabminuscd(ki + numb + 1.0, ji + numb + 1.0, c2 - 1.0, c3)
-        bn = b2
-        bnAdd = c3 + c4 + c2 - 2.0
-        while (b2 > 0.0 and (abs(a2 * b1 - a1 * b2) > abs(cfVSmall * b1 * a2))):
-            c1 = c1 + 1.0
-            c2 = c2 - 1.0
-            an = (c1 * c2) * (c3 * c4)
-            c3 = c3 - 1.0
-            c4 = c4 - 1.0
-            bn = bn + bnAdd
-            bnAdd = bnAdd - 4.0
-            a1 = bn * a2 + an * a1
-            b1 = bn * b2 + an * b1
-            if (b1 > scalefactor):
-              a1 = a1 * scalefactor2
-              b1 = b1 * scalefactor2
-              a2 = a2 * scalefactor2
-              b2 = b2 * scalefactor2
-            c1 = c1 + 1.0
-            c2 = c2 - 1.0
-            an = (c1 * c2) * (c3 * c4)
-            c3 = c3 - 1.0
-            c4 = c4 - 1.0
-            bn = bn + bnAdd
-            bnAdd = bnAdd - 4.0
-            a2 = bn * a1 + an * a2
-            b2 = bn * b1 + an * b2
-            if (b2 > scalefactor):
-              a1 = a1 * scalefactor2
-              b1 = b1 * scalefactor2
-              a2 = a2 * scalefactor2
-              b2 = b2 * scalefactor2
-
-        if b1 < 0.0 or b2 < 0.0:
-           raise ValueError # Actually, something unexpected has happened here - although it will be to do with the values input.
-        else:
-           a1 = a2 / b2 * s
-
-     else:
-        numb = maxSums
-
-     c1 = i - numb + 1.0
-     c2 = mkji - numb + 1.0
-     c3 = ki + numb
-     c4 = ji + numb
-     while (numb > 0.0):
-       a1 = (1.0 + a1) * ((c1 * c2) / (c3 * c4))
-       c1 = c1 + 1.0
-       c2 = c2 + 1.0
-       c3 = c3 - 1.0
-       c4 = c4 - 1.0
-       numb = numb - 1.0
-
-     ha1 = a1
-     a1 = (1.0 + a1) * prob
-     if (swapped == comp):
-        return a1
-     else:
-        if a1 > 0.99:
-           raise ValueError # Similarly, if we end up here then something unexpected has happened - although it will be to do with the values input.
-        else:
-           return 1.0 - a1
-
-def PBB(i, ssmi, beta_shape1, beta_shape2):
-   global hTerm
-   hTerm = hypergeometricTerm(i, ssmi, beta_shape1, beta_shape2)
-   return (beta_shape1 / (i + beta_shape1)) * (beta_shape2 / (beta_shape1 + beta_shape2)) * ((i + ssmi + beta_shape1 + beta_shape2) / (ssmi + beta_shape2)) * hTerm
-
-def pmf_BetaNegativeBinomial(i,r, beta_shape1, beta_shape2):
-   i = AlterForIntegralChecks_Others(i)
-   if (r <= 0.0 or beta_shape1 <= 0.0 or beta_shape2 <= 0.0):
-      raise ValueError
-   elif i < 0:
-      return 0.0
-   else:
-      return (beta_shape2 / (beta_shape1 + beta_shape2)) * (r / (beta_shape1 + r)) * beta_shape1 * (i + beta_shape1 + r + beta_shape2) / ((i + r) * (i + beta_shape2)) * hypergeometricTerm(i, r, beta_shape2, beta_shape1)
-
-def CBNB0(i,r, beta_shape1, beta_shape2, toBeAdded):
-   global ha1,hprob,hswap
-   if (r < 2.0 or beta_shape2 < 2.0):
-# Assumption here that i is integral or greater than 4.
-      mrb2 = max(r, beta_shape2)
-      other = min(r, beta_shape2)
-      cbnb0 = PBB(i, other, mrb2, beta_shape1)
-      if i == 0.0:
-         return cbnb0
-      cbnb0 = cbnb0 * (1.0 + i * (other + beta_shape1) / (((i - 1.0) + mrb2) * (other + 1.0)))
-      if i == 1.0:
-         return cbnb0
-      i = i - 2.0
-      other = other + 2.0
-      temp = PBB(i, mrb2, other, beta_shape1)
-      if i == 0.0:
-         return cbnb0 + temp
-      cbnb0 = cbnb0 + temp * (1.0 + i * (mrb2 + beta_shape1) / (((i - 1.0) + other) * (mrb2 + 1.0)))
-      if i == 1.0:
-         return cbnb0
-      i = i - 2.0
-      mrb2 = mrb2 + 2.0
-      return cbnb0 + CBNB0(i, mrb2, beta_shape1, other, cbnb0)
-   elif (beta_shape1 < 1.0):
-      mrb2 = max(r, beta_shape2)
-      other = min(r, beta_shape2)
-      cbnb0 = hypergeometric(i, mrb2 - 1.0, other, beta_shape1, False)#, ha1, hprob, hswap)
-      if hswap:
-         temp = PBB(mrb2 - 1.0, beta_shape1, i + 1.0, other)
-         if (toBeAdded + (cbnb0 - temp)) < 0.01 * (toBeAdded + (cbnb0 + temp)):
-            return CBNB2(i, mrb2, beta_shape1, other)
-         else:
-            return cbnb0 - temp
-
-      elif ha1 < -0.9 * beta_shape1 / (beta_shape1 + other):
-         raise ValueError
-      else:
-         return hprob * (beta_shape1 / (beta_shape1 + other) + ha1)
-   else:
-      return hypergeometric(i, r, beta_shape2, beta_shape1 - 1.0, False)#, ha1, hprob, hswap)
-
-def CBNB2(i,r, beta_shape1, beta_shape2):
-   ss = min(r, beta_shape2)
-   bs2 = max(r, beta_shape2)
-   r = ss
-   beta_shape2 = bs2
-   d1 = (i + 0.5) * (beta_shape1 + 0.5) - (bs2 - 1.5) * (ss - 0.5)
-   if d1 < 0.0:
-      return CBNB0(i, ss, beta_shape1, bs2, 0.0)
-
-   d1 = math.floor(d1 / (bs2 + beta_shape1 - 1.0)) + 10.0
-   if ss + d1 > bs2: d1 = math.floor(bs2 - ss)
-   ss = ss + d1
-   j = i - d1
-   d2 = (j + 0.5) * (beta_shape1 + 0.5) - (bs2 - 1.5) * (ss - 0.5)
-   if d2 < 0.0:
-      d2 = 10.0
-   else:
-      temp = bs2 + ss + 2.0 * beta_shape1 - 1.0
-      d2 = math.floor((math.sqrt(temp * temp + 4.0 * d2) - temp) / 2.0) + 10.0
-
-   if 2.0 * d2 > i:
-      d2 = math.floor(i / 2.0)
-
-   pbbval = PBB(i, r, beta_shape2, beta_shape1)
-   ss = ss + d2
-   bs2 = bs2 + d2
-   j = j - 2.0 * d2
-   cbnb2 = CBNB0(j, ss, beta_shape1, bs2, 0.0)
-   temp = 1.0
-   d_count = d2 - 2.0
-   # print(d1,d2)
-   j = j + 1.0
-   while d_count >= 0.0:
-      j = j + 1.0
-      bs2 = beta_shape2 + d_count
-      d_count = d_count - 1.0
-      temp = 1.0 + (j * (bs2 + beta_shape1) / ((j + ss - 1.0) * (bs2 + 1.0))) * temp
-
-   j = i - d2 - d1
-   temp = (ss * (j + bs2)) / (bs2 * (j + ss)) * temp
-   d_count = d1 + d2 - 1.0
-   while d_count >= 0:
-      j = j + 1.0
-      ss = r + d_count
-      d_count = d_count - 1.0
-      temp = 1.0 + (j * (ss + beta_shape1) / ((j + bs2 - 1.0) * (ss + 1.0))) * temp
-
-   return cbnb2 + temp * pbbval
-
-def ccBNB5(ilim , rr , a , bb):
-   if rr > bb:
-      r = rr
-      b = bb
-   else:
-      r = bb
-      b = rr
-
-   ccbnb5 = (a + 0.5) * log0(b * r / ((a + 1.0) * (b + a + r + 1.0))) - r * log0(b / (r + a + 1.0)) - b * log0(r / (a + b + 1.0))
-   if r <= 0.001:
-      temp = a + (b + r) * 0.5
-      ccbnb5 = ccbnb5  - b * r * (logfbit2(temp) + (b ** 2 + r ** 2) * logfbit4(temp) / 24.0)
-   else:
-      ccbnb5 = ccbnb5  + (lfbaccdif1(b, r + a) - lfbaccdif1(b, a))
-
-   temp = 0.0
-   if ilim > 0.0:
-      i = ilim
-      while i > 1.0:
-         i = i - 1.0
-         temp = (1.0 + temp) * (i + r) * (i + b) / ((i + r + a + b) * (i + 1.0))
-
-      temp = (1.0 + temp) * math.exp(ccbnb5) * a
-
-   return (r * b * (1.0 - temp) - math.expm1(ccbnb5) * a * (r + a + b)) / ((r + a) * (a + b))
-
-def cdf_BetaNegativeBinomial(i,r, beta_shape1, beta_shape2):
-   i = math.floor(i)
-   if (r <= 0.0 or beta_shape1 <= 0.0 or beta_shape2 <= 0.0):
-      raise ValueError
-   elif i < 0:
-      return 0.0
-   else:
-      return CBNB0(i, r, beta_shape1, beta_shape2, 0.0)
-
-def sf_BetaNegativeBinomial(i,r, beta_shape1, beta_shape2):
-   i = math.floor(i)
-   mrb2 = max(r, beta_shape2)
-   other = min(r, beta_shape2)
-   if (other <= 0.0 or beta_shape1 <= 0.0 or beta_shape2 <= 0.0):
-      raise ValueError
-   elif i < 0.0:
-      return 1.0
-   elif (i == 0.0) or ((i < 1000000.0) and (other < 0.001) and (beta_shape1 > 50.0 * other) and (10.0 * i * beta_shape1 < mrb2)):
-      return ccBNB5(i, mrb2, beta_shape1, other)
-   elif (mrb2 >= 100.0 or other > 20.0 or (mrb2 >= 5.0 and (other - 0.5) * (mrb2 - 0.5) > (i + 0.5) * (beta_shape1 + 0.5))):
-      return CBNB0(mrb2 - 1.0, i + 1.0, other, beta_shape1, 0.0)
-   else:
-      ccbnb = 0.0
-      temp = 0.0
-      i = i + 1.0
-      if other >= 1.0:
-         mrb2 = mrb2 - 1.0
-         other = other - 1.0
-         temp = hypergeometricTerm(i, mrb2, other, beta_shape1)
-         ccbnb = temp
-         while (other >= 1.0) and (temp > 1e-16 * ccbnb):
-            i = i + 1.0
-            beta_shape1 = beta_shape1 + 1.0
-            temp = temp * (mrb2 * other) / (i * beta_shape1)
-            mrb2 = mrb2 - 1.0
-            other = other - 1.0
-            ccbnb = ccbnb + temp
-
-         if other >= 1.0:
-            return ccbnb
-         i = i + 1.0
-         beta_shape1 = beta_shape1 + 1.0
-
-      if mrb2 >= 1.0:
-         mxib1 = max(i, beta_shape1)
-         mnib1 = min(i, beta_shape1)
-         if temp == 0.0:
-            mrb2 = mrb2 - 1.0
-            temp = PBB(mnib1, mrb2, other, mxib1)
-         else: # temp is hypergeometricTerm(mnib1-1, mrb2, other, mxib1-1)
-            temp = temp * other * mrb2
-            mrb2 = mrb2 - 1.0
-            temp = temp / (mnib1 * (mrb2 + mxib1))
-         
-         ccbnb = ccbnb + temp
-         while (mrb2 >= 1.0) and (temp > 1e-16 * ccbnb):
-            temp = temp * mrb2 * (mnib1 + other)
-            mnib1 = mnib1 + 1.0
-            if mnib1 > mxib1:
-               mxib1, mnib1 = mnib1, mxib1
-            
-# Block below not required if hypergeometric block included above and therefore other guaranteed < 1 <= mrb2
-            # if mrb2 < other:
-            #    other, mrb2 = mrb2, other
-            # 
-            mrb2 = mrb2 - 1.0
-            temp = temp / ((mrb2 + mxib1) * mnib1)
-            ccbnb = ccbnb + temp
-
-         if mrb2 >= 1.0:
-            return ccbnb
-         temp = temp * mrb2 / (mnib1 + mrb2)
-      else:
-         mxib1 = beta_shape1
-         mnib1 = i
-         if temp == 0.0:
-            temp = pmf_BetaNegativeBinomial(mnib1, mrb2, mxib1, other)
-         else:
-            temp = temp * mrb2 * other / (i * (mrb2 + other + mnib1 + mxib1 + -1))
-         
-         ccbnb = ccbnb + temp
-      
-      max_iterations = 60.0
-      while True:
-         temp = temp * (mnib1 + mrb2) * (mnib1 + other) / (mnib1 + mxib1 + mrb2 + other)
-         mnib1 = mnib1 + 1.0
-         if mxib1 < mnib1:
-            mxib1, mnib1 = mnib1, mxib1
-         
-         temp = temp / mnib1
-         ccbnb = ccbnb + temp
-         if (temp <= 1e-16 * ccbnb) or (mnib1 + mxib1 > max_iterations):
-            break
-
-      temp = temp * (mnib1 + mrb2) * (mnib1 + other) / ((mnib1 + 1.0) * mxib1)
-      ccbnb = ccbnb + temp
-      mnib1 = mnib1 + 1.0
-      mrb2 = mrb2 - 1.0
-      other = other - 1.0
-      while True:
-         mnib1 = mnib1 + 1.0
-         mxib1 = mxib1 + 1.0
-         temp = temp * (mrb2 * other) / (mnib1 * mxib1)
-         mrb2 = mrb2 - 1.0
-         other = other - 1.0
-         ccbnb = ccbnb + temp
-         if abs(temp) <= 1e-16 * ccbnb:
-            break
-   
-   return ccbnb
-
-def pmf_BetaBinomial(i, sample_size, beta_shape1, beta_shape2):
-   i = AlterForIntegralChecks_Others(i)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-# These probably should be checked once, when creating a BetaBinomial distribution object
-   if ((beta_shape1 <= 0.0) or (beta_shape2 <= 0.0) or (sample_size < 0.0)):
-      raise ValueError
-   elif ((i < 0) or (i > sample_size)):
-      return 0.0
-   else:
-      return (beta_shape1 / (i + beta_shape1)) * (beta_shape2 / (beta_shape1 + beta_shape2)) * ((sample_size + beta_shape1 + beta_shape2) / (sample_size - i + beta_shape2)) * hypergeometricTerm(i, sample_size - i, beta_shape1, beta_shape2)
-
-def cdf_BetaBinomial(i, sample_size, beta_shape1, beta_shape2):
-   i = math.floor(i)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-   if ((beta_shape1 <= 0.0) or (beta_shape2 <= 0.0) or (sample_size < 0.0)):
-      raise ValueError
-   elif (i < 0.0):
-      return 0.0
-   else:
-      i = i + 1.0
-      return sf_BetaNegativeBinomial(sample_size - i, i, beta_shape1, beta_shape2)
-
-def sf_BetaBinomial(i, sample_size, beta_shape1, beta_shape2):
-   i = math.floor(i)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-   if ((beta_shape1 <= 0.0) or (beta_shape2 <= 0.0) or (sample_size < 0.0)):
-      raise ValueError
-   elif (i < 0.0):
-      return 1.0
-   elif (i >= sample_size):
-      return 0.0
-   else:
-      return sf_BetaNegativeBinomial(i, sample_size - i, beta_shape2, beta_shape1)
-
-# Hypergeometric & Negative Hpergeometric pmfs, cdfs & sfs for completeness
-def pmf_hypergeometric(type1s, sample_size, tot_type1, pop_size):
-   type1s = AlterForIntegralChecks_Others(type1s)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((sample_size < 0.0) or (tot_type1 < 0.0) or (sample_size > pop_size) or (tot_type1 > pop_size)):
-      raise ValueError
-   else:
-      return hypergeometricTerm(type1s, sample_size - type1s, tot_type1 - type1s, pop_size - tot_type1 - sample_size + type1s)
-
-def cdf_hypergeometric(type1s, sample_size, tot_type1, pop_size):
-   type1s = math.floor(type1s)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((sample_size < 0.0) or (tot_type1 < 0.0) or (sample_size > pop_size) or (tot_type1 > pop_size)):
-      raise ValueError
-   else:
-      return hypergeometric(type1s, sample_size - type1s, tot_type1 - type1s, pop_size - tot_type1 - sample_size + type1s, False)
-
-def sf_hypergeometric(type1s, sample_size, tot_type1, pop_size):
-   type1s = math.floor(type1s)
-   sample_size = AlterForIntegralChecks_Others(sample_size)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((sample_size < 0.0) or (tot_type1 < 0.0) or (sample_size > pop_size) or (tot_type1 > pop_size)):
-      raise ValueError
-   else:
-      return hypergeometric(type1s, sample_size - type1s, tot_type1 - type1s, pop_size - tot_type1 - sample_size + type1s, True)
-   
-def pmf_neghypergeometric(type2s, type1s_reqd, tot_type1, pop_size):
-   type2s = AlterForIntegralChecks_Others(type2s)
-   type1s_reqd = AlterForIntegralChecks_Others(type1s_reqd)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((type1s_reqd <= 0.0) or (tot_type1 < type1s_reqd) or (tot_type1 > pop_size)):
-      raise ValueError
-   elif (type2s < 0.0) or (tot_type1 + type2s > pop_size):
-      if type2s == 0.0:
-         return 1.0
-      else:
-         return 0.0
-   else:
-      return hypergeometricTerm(type1s_reqd - 1.0, type2s, tot_type1 - type1s_reqd + 1.0, pop_size - tot_type1 - type2s) * (tot_type1 - type1s_reqd + 1.0) / (pop_size - type1s_reqd - type2s + 1.0)
-   
-def cdf_neghypergeometric(type2s, type1s_reqd, tot_type1, pop_size):
-   type2s = math.floor(type2s)
-   type1s_reqd = AlterForIntegralChecks_Others(type1s_reqd)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((type1s_reqd <= 0.0) or (tot_type1 < type1s_reqd) or (tot_type1 > pop_size)):
-      raise ValueError
-   elif (tot_type1 + type2s > pop_size):
-      return 1.0
-   else:
-      return hypergeometric(type2s, type1s_reqd, pop_size - tot_type1 - type2s, tot_type1 - type1s_reqd, False)
-
-def sf_neghypergeometric(type2s, type1s_reqd, tot_type1, pop_size):
-   type2s = math.floor(type2s)
-   type1s_reqd = AlterForIntegralChecks_Others(type1s_reqd)
-   tot_type1 = AlterForIntegralChecks_Others(tot_type1)
-   pop_size = AlterForIntegralChecks_Others(pop_size)
-   if ((type1s_reqd <= 0.0) or (tot_type1 < type1s_reqd) or (tot_type1 > pop_size)):
-      raise ValueError
-   elif (tot_type1 + type2s > pop_size):
-      return 0.0
-   else:
-      return hypergeometric(type2s, type1s_reqd, pop_size - tot_type1 - type2s, tot_type1 - type1s_reqd, True)
-
-
-
diff --git a/code/fileio.py b/code/fileio.py
index 5f34be4..b6e4044 100644
--- a/code/fileio.py
+++ b/code/fileio.py
@@ -10,7 +10,7 @@ def read_in_files(input_file, state_tree_file):
     with open(state_tree_file) as f:
         for line in f:
             line_split = line.strip().split(',')
-            edge = map(int, line_split[:2])
+            edge = list(map(int, line_split[:2]))
             S.append(edge)
             try: 
                 L[tuple(edge)] = line_split[2:]
diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py
index 7140834..7289b6f 100644
--- a/code/optimize_mutation_matrix.py
+++ b/code/optimize_mutation_matrix.py
@@ -81,13 +81,13 @@ def solve_model(C):
                 try:
                     p,a = v.varName.split('_')[1:]
                 except:
-                    print v.varName
+                    print(v.varName)
                     raise
                 B.loc[p][a] = v.x
 
         
-        print "Optimized B --------------------"
-        print B
+        print("Optimized B --------------------")
+        print(B)
 
         deletions = output_with_deletions(C,m)
         return B, deletions
@@ -100,17 +100,17 @@ def solve_model(C):
 
 def output_with_deletions(C,m):
     deletions = []
-    print "C COLUMNS", C.columns
-    print C 
+    print("C COLUMNS", C.columns)
+    print(C)
     for v in C.index:
         if v.startswith('ANC:'):
             for c in C.columns:
                 if C.loc[v][c] == EPSILON: 
                     value =  m.getVarByName("b_{}_{}".format(v, c)).x
                     if value == 1: 
-                        print "DELETION DETECTED", v,c
+                        print("DELETION DETECTED", v,c)
                         deletions.append((v,c))
-                    print "---------------------------------------------------------------- EPSILON"
+                    print("---------------------------------------------------------------- EPSILON")
     return deletions
     #raise Exception()
 
@@ -127,26 +127,26 @@ def calculate_C(c, sigmas, DPs, BC):
         C[a] = BC[BC['c']==c].apply(lambda x: calc_c_observed_cell(x['{}_v'.format(a)], \
                x['{}_t'.format(a)]), axis=1)
     
-    print "MIXED MUTS FOR STATE {}: {}".format(c, mixed_muts)
+    print("MIXED MUTS FOR STATE {}: {}".format(c, mixed_muts))
     def desc_scores(v):
-        print v
+        print(v)
         if v == 1: return 100000
         elif v == 0: return -100000
         else: 
             global EPSILON
-            print "------------------------------", EPSILON
+            print("------------------------------", EPSILON)
             return EPSILON
     
 
     try:
         descs = DPs[c]
-        print "----------------- DESCS", descs
+        print("----------------- DESCS", descs)
     except KeyError:
         descs=[]
 
     for i,D in enumerate(descs):
         child, desc = D
-        print child
+        print(child)
         d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\
             index = ['ANC:{}'.format(child)])
         C = C.append(d)
@@ -160,27 +160,27 @@ def calc_c_observed_cell(v,t):
 
 def get_descendent_profiles(sigmas, mutations, S, L):
     DPs = {}
-    print L
+    print(L)
     for edge in S:
         parent, child = edge
-        print edge, L[tuple(edge)]
+        print(edge, L[tuple(edge)])
         child_status = {m:sigmas[m][child] for m in mutations}
 
         parent_status = {m:sigmas[m][parent] for m in mutations}
 
-        print "child status", child_status
-        print "parent status", parent_status
+        print("child status", child_status)
+        print("parent status", parent_status)
         child_founder_profile = {m:1 if child_status[m] == 'Present' and parent_status[m] in ['Mixed', 'Present'] \
                                  else 0 for m in mutations}
         descendent_profile = {m:1 if child_founder_profile[m] == 1 else '?' if m in L[tuple(edge)] \
                               and parent_status[m] in ['Mixed', 'Present'] else 0 for m in mutations}
 
-        print [descendent_profile[v] for v in descendent_profile]
+        print([descendent_profile[v] for v in descendent_profile])
         if parent not in DPs:
             DPs[parent] = []
         DPs[parent].append((child, descendent_profile)) 
-        #print "DPS"
-        #print (child, descendent_profile)
+        #print("DPS")
+        #print(child, descendent_profile)
 
     return DPs 
 
@@ -188,8 +188,8 @@ def assemble_mutation_matrix(Bs,sigmas, BC, mutations):
 
     # Bs give partial information and sigmas assemble the rest
     B_tot = pd.DataFrame(columns = mutations, index = BC.index)
-    #print mutations
-    #print Bs
+    #print(mutations)
+    #print(Bs)
 
     for i in Bs:
 
@@ -218,7 +218,7 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     index = []
     for i in Bs:
             index+=Bs[i].index.tolist()
-            print Bs[i].index
+            print(Bs[i].index)
 
 
     # Bs give partial information and sigmas assemble the rest
@@ -270,4 +270,4 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
 
     B = solve_model(C)
 
-    #print B
+    #print(B)
diff --git a/code/optimize_sigma.py b/code/optimize_sigma.py
index 7c1edba..3728fc1 100644
--- a/code/optimize_sigma.py
+++ b/code/optimize_sigma.py
@@ -25,7 +25,7 @@ def get_optimal_sigma(S, BC, L):
     subtrees = enum_all_subtrees(num_states, S)
 
     sigmas = {}
-    print L
+    print(L)
 
     deletions = []
     for mutation in mutations:
@@ -66,8 +66,12 @@ def get_optimal_sigma(S, BC, L):
         sigmas[mutation] = max_sigma
 
     sigma_new = pd.DataFrame(sigmas)
-    print "SIGMA"
-    print sigma_new
+
+    # Reindex columns of dataframe to be in alphabetical order
+    sigma_new = sigma_new.reindex(sorted(sigma_new.columns), axis=1)
+
+    print("SIGMA")
+    print(sigma_new)
 
     return sigma_new, deletions
 
@@ -89,7 +93,7 @@ def log_prob_sigma(V,T,C,sigma):
 
     log_prob = 0
     for i,R in enumerate(zip(V,T,C)):
-        # print '-------'
+        # print('-------')
         v,t,c = R
 
         status = sigma[c]
@@ -136,7 +140,7 @@ def enum_rooted_subtrees(subtree, edgelist, frontier, treelist = []):
 
     # TEST
 
-    print "Testing Opitimize Sigma"
+    print("Testing Opitimize Sigma")
     input_file = "test_data/BC.csv"
     state_tree = "test_data/S.csv"
     BC = pd.read_csv(input_file)
@@ -148,16 +152,16 @@ def enum_rooted_subtrees(subtree, edgelist, frontier, treelist = []):
     with open(state_tree) as f:
         for line in f:
             line_split = line.strip().split(',')
-            edge = map(int, line_split[:2])
+            edge = list(map(int, line_split[:2]))
             S.append(edge)
             try: 
                 L[tuple(edge)] = line_split[2:]
             except:
                 L[tuple(edge)] = []
-    print "Input data"
-    print "B,C"
-    print BC
-    print "State tree S"
-    print S
-    print "Result"
-    print get_optimal_sigma(S,BC)
+    print("Input data")
+    print("B,C")
+    print(BC)
+    print("State tree S")
+    print(S)
+    print("Result")
+    print(get_optimal_sigma(S,BC))
diff --git a/code/probmodels.py b/code/probmodels.py
index 2bb0705..e561cf6 100644
--- a/code/probmodels.py
+++ b/code/probmodels.py
@@ -1,12 +1,10 @@
-from betabinom import pmf_BetaBinomial
 import pandas as pd
 
 PROB_SEQ_ERROR = 0.001
 BETABINOM_ALPHA = 1.0
 BETABINOM_BETA = 1.0
 
-
-from scipy.stats import binom
+from scipy.stats import betabinom, binom
 import math
 # Calculating the optimal sigma
 
@@ -15,17 +13,17 @@ def log_prob_absent(v,t):
     return prob
 
 def log_prob_present(v,t):
-    prob = math.log(pmf_BetaBinomial(v, t, BETABINOM_ALPHA, BETABINOM_BETA))
+    prob = math.log(betabinom.pmf(v, t, BETABINOM_ALPHA, BETABINOM_BETA))
     return prob
 
 def log_prob_mixed(v,t):
     prob = math.log(0.5 * math.exp(log_prob_absent(v,t)) + (0.5 * math.exp(log_prob_present(v,t))))
-    #print v,t, 'MIXED:', math.exp(prob), math.exp(log_prob_absent(v,t)), math.exp(log_prob_present(v,t)) 
+    #print(v,t, 'MIXED:', math.exp(prob), math.exp(log_prob_absent(v,t)), math.exp(log_prob_present(v,t)))
     return prob
 
 def compute_LL_solution(BC, result, mutations):
-    print BC
-    print result
+    print(BC)
+    print(result)
 
     def helper(x,v,t):
         if x == 0:
diff --git a/code/scarlet.py b/code/scarlet.py
index 6fd4ed9..4ef69c9 100644
--- a/code/scarlet.py
+++ b/code/scarlet.py
@@ -7,9 +7,9 @@
 import sys
 def correct_ternary_matrix(B, S, BC, deletions):
     T = B.copy()
-
+    
     for deletion in deletions:
-        print "  ----", deletion
+        print("  ----", deletion)
         child, mut = deletion
         child = int(child.replace('ANC:', ''))
         # calculate the set of copy-number states affected by the deletion
@@ -23,10 +23,9 @@ def correct_ternary_matrix(B, S, BC, deletions):
 
         # This is the set of states where the mutation has been deleted. Thus it gets set to be
         # a two in the output data
-
+        
         cells = BC[BC['c'].apply(lambda x: x in set_states)].index
-        T[mut] = T.apply(lambda x: x if x.name not in cells else 2 , axis=1)
-
+        T[mut] = T.apply(lambda x: x if x.name not in cells else 2 , axis=1)[mut]
     return T
 
 def main():
@@ -50,7 +49,7 @@ def main():
         Bs[i]=B
         all_deletions += deletions
 
-    print "All DELETIONS", all_deletions
+    print("All DELETIONS", all_deletions)
 
 
     result = assemble_mutation_matrix(Bs, sigmas, BC, mutations)

From 09793949d86187be42151fe1d7f55bf7ed2d04f7 Mon Sep 17 00:00:00 2001
From: Bolladeen <aj7381@princeton.edu>
Date: Wed, 7 Dec 2022 19:55:41 -0500
Subject: [PATCH 2/4] added germline mutation input functionality

---
 code/fileio.py                   | 13 +++++++++----
 code/optimize_mutation_matrix.py | 13 ++++++++-----
 code/optimize_sigma.py           | 25 ++++++++++++++++---------
 code/scarlet.py                  | 18 ++++++++++++------
 4 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/code/fileio.py b/code/fileio.py
index b6e4044..36fbe97 100644
--- a/code/fileio.py
+++ b/code/fileio.py
@@ -1,10 +1,11 @@
 import pandas as pd
 
-def read_in_files(input_file, state_tree_file):
+def read_in_files(input_file, state_tree_file, germline_mutations_file):
     BC = pd.read_csv(input_file, index_col=0)
     S = []
     L = {}
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
+    SNP = []
+    mutations = set(sorted([v.rsplit('_',1)[0] for v in BC.columns[1:]]))
     BC.index = map(str, BC.index)
 
     with open(state_tree_file) as f:
@@ -16,8 +17,12 @@ def read_in_files(input_file, state_tree_file):
                 L[tuple(edge)] = line_split[2:]
             except:
                 L[tuple(edge)] = []
-
-    return BC, S, L
+    
+    with open(germline_mutations_file) as f:
+        contents = f.readlines()
+        if len(contents) > 0:
+            SNP = contents[0].strip().split(',')
+    return BC, S, L, SNP
 
 def write_out_files(B, B_with_ancestors, T, filename, totalLL):
     B.to_csv('{}.B'.format(filename))
diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py
index 7289b6f..6dcd14f 100644
--- a/code/optimize_mutation_matrix.py
+++ b/code/optimize_mutation_matrix.py
@@ -15,6 +15,8 @@ def solve_model(C):
         vals = []
         # Create a new model
         m = Model("mip1")
+        # Add Solver Time Limit
+        m.setParam('TimeLimit', 5*60)
 
         Bs = {}
         # Create mutation matrix
@@ -22,7 +24,7 @@ def solve_model(C):
             for a in C.columns:
                 Bs[(p,a)] = m.addVar(vtype=GRB.BINARY, name="b_{}_{}".format(p,a))
                 vals.append((p,a))
-
+        
         for i,a in enumerate(C.columns):
             for b in C.columns[i+1:]:
 
@@ -79,7 +81,7 @@ def solve_model(C):
         for v in m.getVars():
             if v.varName.startswith('b'):
                 try:
-                    p,a = v.varName.split('_')[1:]
+                    p,a = v.varName.split('_', 2)[1:]
                 except:
                     print(v.varName)
                     raise
@@ -127,6 +129,7 @@ def calculate_C(c, sigmas, DPs, BC):
         C[a] = BC[BC['c']==c].apply(lambda x: calc_c_observed_cell(x['{}_v'.format(a)], \
                x['{}_t'.format(a)]), axis=1)
     
+
     print("MIXED MUTS FOR STATE {}: {}".format(c, mixed_muts))
     def desc_scores(v):
         print(v)
@@ -143,15 +146,15 @@ def desc_scores(v):
         print("----------------- DESCS", descs)
     except KeyError:
         descs=[]
+    
 
     for i,D in enumerate(descs):
         child, desc = D
-        print(child)
+        #print(child)
         d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\
             index = ['ANC:{}'.format(child)])
         C = C.append(d)
 
-        
     #C = C.reset_index(drop = True)
     return C
         
@@ -259,7 +262,7 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     input_file = "test_data/BC.csv"
     state_tree = "test_data/S.csv"
     BC, S, L = read_in_files(input_file, state_tree)
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
+    mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
 
     sigmas = get_optimal_sigma(S,BC)
 
diff --git a/code/optimize_sigma.py b/code/optimize_sigma.py
index 3728fc1..a829d9d 100644
--- a/code/optimize_sigma.py
+++ b/code/optimize_sigma.py
@@ -1,11 +1,11 @@
-
+from itertools import chain
 from probmodels import log_prob_absent, log_prob_present, log_prob_mixed
 import pandas as pd
 
 ###
 #   Calculates the optimal sigma assignments for all mutations given state tree S
 ###
-def get_optimal_sigma(S, BC, L):
+def get_optimal_sigma(S, BC, L, SNP):
     """
     For all mutations calculates the optimal sigma assignment
     S -- edgelist representation of copy-number state tree
@@ -19,11 +19,12 @@ def get_optimal_sigma(S, BC, L):
     copy-number states, and entries in {'Absent', 'Present', 'Mixed'}
 
     """
-    mutations = set(sorted([v.split('_')[0] for v in BC.columns[1:]]))
+    mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
     C = list(BC['c'])
-    num_states = len(set(C))
+    #num_states = len(set(C))
+    num_states = len(set(chain(*S)))
     subtrees = enum_all_subtrees(num_states, S)
-
+    print(subtrees)
     sigmas = {}
     print(L)
 
@@ -32,15 +33,17 @@ def get_optimal_sigma(S, BC, L):
         max_value = float('-inf')
         max_sigma = None
         max_deletions = None
+
         V = list(BC['{}_v'.format(mutation)])
         T = list(BC['{}_t'.format(mutation)])
+        
         for subtree in subtrees:
 
             status1 = ['Mixed' if i == subtree[0] else 'Present' if i in subtree[1:] else 'Absent' for i in range(num_states)]
-            
             # S is the edge list
             # For every edge, I need to check that if I go from present to absent
             # then there is an allowed loss
+            
             valid_tree = True
             tree_deletions = []
             for edge in S:
@@ -52,18 +55,22 @@ def get_optimal_sigma(S, BC, L):
                         valid_tree = False
                     else:
                         tree_deletions.append(('ANC:{}'.format(t), mutation))
-                    
+            
             if not valid_tree: continue
                 
             p1 = log_prob_sigma(V,T,C, status1)
-
             if p1 > max_value:
                 max_value = p1
                 max_sigma = status1
                 max_deletions = tree_deletions
-    
+        
         deletions += max_deletions
         sigmas[mutation] = max_sigma
+    
+    # Converted all SNP mutation 0 state sigma value to Present to account for Germline Mutations
+    for mutation in SNP:
+        if mutation in sigmas.keys():
+            sigmas[mutation][0] = 'Present'
 
     sigma_new = pd.DataFrame(sigmas)
 
diff --git a/code/scarlet.py b/code/scarlet.py
index 4ef69c9..61ea996 100644
--- a/code/scarlet.py
+++ b/code/scarlet.py
@@ -31,11 +31,16 @@ def correct_ternary_matrix(B, S, BC, deletions):
 def main():
     BC_file = sys.argv[1]
     SL_file = sys.argv[2]
-    output_file = sys.argv[3]
 
-    BC, S, L = read_in_files(BC_file, SL_file)
-    mutations = sorted(set([v.split('_')[0] for v in BC.columns[1:]]))
-    sigmas, dels = get_optimal_sigma(S,BC,L)
+    #TODO: Add input file containing list of germline mutations (SNPs)
+    SNP_file = sys.argv[3]
+    
+    output_file = sys.argv[4]
+
+    BC, S, L, SNP = read_in_files(BC_file, SL_file, SNP_file)
+    mutations = sorted(set([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
+    sigmas, dels = get_optimal_sigma(S,BC,L,SNP)
+    
     DPs = get_descendent_profiles(sigmas, mutations, S, L)
 
     all_deletions = dels
@@ -44,13 +49,14 @@ def main():
     for i in cn_states:
 
         C= calculate_C(i, sigmas, DPs, BC)
-
+        
         B, deletions = solve_model(C)
+
         Bs[i]=B
         all_deletions += deletions
 
     print("All DELETIONS", all_deletions)
-
+    
 
     result = assemble_mutation_matrix(Bs, sigmas, BC, mutations)
     result_with_ancestors = assemble_mutation_matrix_with_ancestors(Bs, sigmas, BC, mutations)

From aad0952cea6f8b154fd54ed892b1380c5bb3760e Mon Sep 17 00:00:00 2001
From: Bolladeen <aj7381@princeton.edu>
Date: Tue, 13 Dec 2022 18:17:59 -0500
Subject: [PATCH 3/4] changed optimizer time limit to 4 hours

---
 code/optimize_mutation_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py
index 6dcd14f..a88bcde 100644
--- a/code/optimize_mutation_matrix.py
+++ b/code/optimize_mutation_matrix.py
@@ -16,7 +16,7 @@ def solve_model(C):
         # Create a new model
         m = Model("mip1")
         # Add Solver Time Limit
-        m.setParam('TimeLimit', 5*60)
+        m.setParam('TimeLimit', 5*60*60)
 
         Bs = {}
         # Create mutation matrix

From fd748ad21d82d164d8b76348e5174f8bf3cb2923 Mon Sep 17 00:00:00 2001
From: Bolladeen <aj7381@princeton.edu>
Date: Tue, 21 Feb 2023 13:40:07 -0500
Subject: [PATCH 4/4] aligned scarlet error model to CONDOR

---
 code/optimize_mutation_matrix.py | 30 ++++++++++++++++++++++--------
 code/optimize_sigma.py           |  1 +
 code/probmodels.py               |  9 +++++----
 code/scarlet.py                  |  8 +++++---
 4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/code/optimize_mutation_matrix.py b/code/optimize_mutation_matrix.py
index a88bcde..3e25587 100644
--- a/code/optimize_mutation_matrix.py
+++ b/code/optimize_mutation_matrix.py
@@ -72,16 +72,34 @@ def solve_model(C):
 
         # Set objective
         objn = sum(Bs[(p,a)]*C.loc[p][a] for p,a in vals)
-
         m.setObjective(objn, GRB.MAXIMIZE)
 
         m.optimize()
 
         B = C.copy()
+
         for v in m.getVars():
             if v.varName.startswith('b'):
                 try:
-                    p,a = v.varName.split('_', 2)[1:]
+                    ''' 
+                    if v.varName.startswith('b_ANC:'):                                                                                                   
+                        p = '_'.join(v.varName.split('_')[1:3])                                                                                          
+                        a = '_'.join(v.varName.split('_')[3:])                                                                                           
+                     else:                                                                                                                                
+                        p = '_'.join(v.varName.split('_')[1:4])                                                                                          
+                        a = '_'.join(v.varName.split('_')[4:])       
+                    '''
+                    if v.varName.startswith('b_ANC:'):
+                        filter_var = v.varName.split(':')[1]
+                        p = '_'.join(filter_var.split('_')[0:1])
+                        a = '_'.join(filter_var.split('_')[1:])
+                        if p.isdigit():
+                            p = B.index[int(p)]
+                    else:
+                        filter_var = v.varName.split('_',1)[1]
+                        p = '_'.join(filter_var.split('_')[0:3])
+                        a = '_'.join(filter_var.split('_')[3:]) 
+                        #p,a = v.varName.split('_')[1:]
                 except:
                     print(v.varName)
                     raise
@@ -147,12 +165,11 @@ def desc_scores(v):
     except KeyError:
         descs=[]
     
-
     for i,D in enumerate(descs):
         child, desc = D
         #print(child)
         d = pd.DataFrame([[desc_scores(desc[a]) for a in mixed_muts]], columns = mixed_muts,\
-            index = ['ANC:{}'.format(child)])
+                index = ['ANC:{}'.format(child)])
         C = C.append(d)
 
     #C = C.reset_index(drop = True)
@@ -182,8 +199,6 @@ def get_descendent_profiles(sigmas, mutations, S, L):
         if parent not in DPs:
             DPs[parent] = []
         DPs[parent].append((child, descendent_profile)) 
-        #print("DPS")
-        #print(child, descendent_profile)
 
     return DPs 
 
@@ -263,7 +278,6 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     state_tree = "test_data/S.csv"
     BC, S, L = read_in_files(input_file, state_tree)
     mutations = set(sorted([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
-
     sigmas = get_optimal_sigma(S,BC)
 
     DPs = get_descendent_profiles(sigmas, mutations)
@@ -272,5 +286,5 @@ def assemble_mutation_matrix_with_ancestors(Bs,sigmas, BC, mutations):
     C = calculate_C(1, sigmas, DPs)
 
     B = solve_model(C)
-
+    
     #print(B)
diff --git a/code/optimize_sigma.py b/code/optimize_sigma.py
index a829d9d..bba29a0 100644
--- a/code/optimize_sigma.py
+++ b/code/optimize_sigma.py
@@ -103,6 +103,7 @@ def log_prob_sigma(V,T,C,sigma):
         # print('-------')
         v,t,c = R
 
+
         status = sigma[c]
         if status == 'Absent':
             log_prob += log_prob_absent(v,t)
diff --git a/code/probmodels.py b/code/probmodels.py
index e561cf6..1efd587 100644
--- a/code/probmodels.py
+++ b/code/probmodels.py
@@ -1,19 +1,20 @@
 import pandas as pd
 
 PROB_SEQ_ERROR = 0.001
-BETABINOM_ALPHA = 1.0
-BETABINOM_BETA = 1.0
+ADO_PRECISION = 15
+BETABINOM_ALPHA = PROB_SEQ_ERROR * ADO_PRECISION
+BETABINOM_BETA = (1 - PROB_SEQ_ERROR) * ADO_PRECISION
 
 from scipy.stats import betabinom, binom
 import math
 # Calculating the optimal sigma
 
 def log_prob_absent(v,t):
-    prob = binom.logpmf(v,t,PROB_SEQ_ERROR)
+    prob = betabinom.logpmf(v,t, BETABINOM_ALPHA, BETABINOM_BETA)
     return prob
 
 def log_prob_present(v,t):
-    prob = math.log(betabinom.pmf(v, t, BETABINOM_ALPHA, BETABINOM_BETA))
+    prob = betabinom.logpmf(v, t, 1, 1)
     return prob
 
 def log_prob_mixed(v,t):
diff --git a/code/scarlet.py b/code/scarlet.py
index 61ea996..43e5ab7 100644
--- a/code/scarlet.py
+++ b/code/scarlet.py
@@ -11,6 +11,7 @@ def correct_ternary_matrix(B, S, BC, deletions):
     for deletion in deletions:
         print("  ----", deletion)
         child, mut = deletion
+        #print('DELETION: ', deletion, child, mut)
         child = int(child.replace('ANC:', ''))
         # calculate the set of copy-number states affected by the deletion
         # not the most efficient way to do this but S should be small
@@ -31,23 +32,24 @@ def correct_ternary_matrix(B, S, BC, deletions):
 def main():
     BC_file = sys.argv[1]
     SL_file = sys.argv[2]
-
+    
     #TODO: Add input file containing list of germline mutations (SNPs)
     SNP_file = sys.argv[3]
     
     output_file = sys.argv[4]
 
     BC, S, L, SNP = read_in_files(BC_file, SL_file, SNP_file)
+    
     mutations = sorted(set([v.rsplit('_', 1)[0] for v in BC.columns[1:]]))
     sigmas, dels = get_optimal_sigma(S,BC,L,SNP)
-    
     DPs = get_descendent_profiles(sigmas, mutations, S, L)
 
     all_deletions = dels
     cn_states = BC['c'].unique()
     Bs = {}
+    
     for i in cn_states:
-
+        
         C= calculate_C(i, sigmas, DPs, BC)
         
         B, deletions = solve_model(C)