From 945c57177794c2c947517c49f8e1883a0b898ac0 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Tue, 20 Nov 2018 11:14:34 -0600
Subject: [PATCH 01/20] Change fn sigs

---
 edgePy/DGEList.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index b345d14..1fa3138 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -400,7 +400,7 @@ def get_gene_mask_and_lengths(self, gene_data):
 
     def tpm(
         self,
-        gene_lengths: np.ndarray,
+        gene_data: CanonicalDataStore,
         transform_to_log: bool = False,
         prior_count: float = PRIOR_COUNT,
         mean_fragment_lengths: np.ndarray = None,
@@ -419,9 +419,9 @@ def tpm(
            \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6
 
         Args:
-            gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`.
+            gene_data: An object that works to import Ensembl based data, for use in calculations
             transform_to_log: store log outputs
-            prior_count:
+            prior_count: a minimum value for genes, if you do log transforms
             mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts`
                 (optional)
 
@@ -430,10 +430,10 @@ def tpm(
         # compute effective length not allowing negative lengths
         if mean_fragment_lengths:
             effective_lengths = (
-                gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
+                gene_data[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
             ).clip(min=1)
         else:
-            effective_lengths = gene_lengths[:, np.newaxis]
+            effective_lengths = gene_data[:, np.newaxis]
 
         # how many counts per base
         base_counts = self.counts / effective_lengths

From a2fdfb086f4a615cb778dd5b217402e9dec120ff Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 24 Nov 2018 14:07:01 -0600
Subject: [PATCH 02/20] Harmonize TPM RPKM

---
 edgePy/DGEList.py | 49 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 1fa3138..f708cdb 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -357,6 +357,46 @@ def rpkm(
 
         return self.copy(counts=counts, current_log=current_log, genes=genes)
 
+    def tpm(
+        self,
+        gene_data: CanonicalDataStore,
+        transform_to_log: bool = False,
+        prior_count: float = PRIOR_COUNT,
+    ) -> "DGEList":
+        """Return the DGEList normalized to transcripts per million reads.
+        TPM =   countsPerBase * ( 1/sum[countsPerBase]) * 10^6
+
+        Args:
+            gene_data: An object that works to import Ensembl based data, for use in calculations
+            transform_to_log: true, if you wish to convert to log after converting to RPKM
+            prior_count: a minimum value for genes, if you do log transforms.
+        """
+        current_log = self.current_log_status
+
+        #checks current log status and converts if currently in log format
+        if self.current_log_status:
+            self.counts = np.exp(self.counts)
+            current_log = False
+
+        #returns canonical gene length and gene mask
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+
+        genes = self.genes[gene_mask].copy()
+        counts = self.counts[gene_mask].copy()
+
+        #calculates counts per base (ie, the rate)
+        rate = (counts.T / gene_len_ordered).T
+
+        #calculates tpm
+        counts = (rate / np.sum(rate)) * 1e6
+
+        if transform_to_log:
+            counts = self.log_transform(counts, prior_count)
+            current_log = True
+
+        return self.copy(counts=counts, current_log=current_log, genes=genes)
+
+
     def get_gene_mask_and_lengths(self, gene_data):
 
         """
@@ -398,9 +438,10 @@ def get_gene_mask_and_lengths(self, gene_data):
                     gene_mask.append(False)
         return gene_len_ordered, gene_mask
 
+'''Commented out for now. May reuse later, depending on approach to user-defined DGELists
     def tpm(
         self,
-        gene_data: CanonicalDataStore,
+        gene_lengths: np.ndarray,
         transform_to_log: bool = False,
         prior_count: float = PRIOR_COUNT,
         mean_fragment_lengths: np.ndarray = None,
@@ -430,10 +471,10 @@ def tpm(
         # compute effective length not allowing negative lengths
         if mean_fragment_lengths:
             effective_lengths = (
-                gene_data[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
+                gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
             ).clip(min=1)
         else:
-            effective_lengths = gene_data[:, np.newaxis]
+            effective_lengths = gene_lengths[:, np.newaxis]
 
         # how many counts per base
         base_counts = self.counts / effective_lengths
@@ -445,7 +486,7 @@ def tpm(
             current_log = True
 
         return self.copy(counts=counts, current_log=current_log)
-
+'''
     def __repr__(self) -> str:
         """Give a pretty non-executeable representation of this object."""
         num_samples = len(self._samples) if self._samples is not None else 0

From 53bff6919b67862599614022f8be9d5e69647ee3 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 1 Dec 2018 14:53:57 -0600
Subject: [PATCH 03/20] Start work on customdatastore

---
 edgePy/data_import/user_defined.py | 57 ++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 edgePy/data_import/user_defined.py

diff --git a/edgePy/data_import/user_defined.py b/edgePy/data_import/user_defined.py
new file mode 100644
index 0000000..7ad8ce2
--- /dev/null
+++ b/edgePy/data_import/user_defined.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+class CustomDataStore(object):
+    """
+    A data structure allowing user-defined gene data; analogous to CanonicalDataStore generated from Ensembl data.
+
+    Args:
+        genes: list of gene names
+        transcripts: list of transcript names
+        lengths: length of transcripts
+        canonical: defines whether transcript is canonical or not
+        symbols: list of gene symbols
+    """
+
+    def __init__(
+        self,
+        genes: Optional[np.array] = None,
+        transcripts: Optional[np.ndarray] = None,
+        lengths: Optional[np.ndarray] = None,
+        canonical: Optional[np.ndarray] = None,
+        symbols: Optional[np.ndarray] = None
+    ) -> None:
+
+        if genes or transcripts or lengths or symbols is None:
+            raise Exception("All DataStore components must be provided at init")
+        else:
+            if genes.size != transcripts.size:
+                raise Exception("Gene array and transcript array are not the same size")
+            if genes.size != lengths.size:
+                raise Exception("Gene array and length array are not the same size")
+            if genes.size != symbols.size:
+                raise Exception("Gene array and symbol array are not the same size")
+            if canonical is None:
+                for i in genes:
+                    canonical[i] = True
+                log.info(f"No canonical status input. All input lengths assumed to be canonical.")
+
+            self.genes = genes
+            self.transcripts = transcripts
+            self.lengths = lengths
+            self.canonical = canonical
+            self.symbols = symbols
+
+    @classmethod
+    def create_CustomDataStore(
+            cls,
+            gene_list: List[str],
+            transcript_list: List[str],
+            length_list: List[int],
+            is_canonical: List[bool],
+            symbols_list: List[str]
+    ) -> "CustomDataStore" :
+        """
+        Creates a CustomDataStore for use with DGElist operations. Alternative for data lacking Ensembl files.
+        """
+
+        log.info(f"Creating CustomDataStore object using user-defined data")
\ No newline at end of file

From 17217234b0270dd046b42737e62e53709eab9d72 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Tue, 11 Dec 2018 14:02:51 -0600
Subject: [PATCH 04/20] TPM fn sig harmonized to RPKM

---
 edgePy/DGEList.py                  |  6 ++--
 edgePy/data_import/user_defined.py | 57 ------------------------------
 tests/test_DGEList.py              | 18 +++++++++-
 3 files changed, 21 insertions(+), 60 deletions(-)
 delete mode 100644 edgePy/data_import/user_defined.py

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index f708cdb..a2f84ad 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -365,6 +365,8 @@ def tpm(
     ) -> "DGEList":
         """Return the DGEList normalized to transcripts per million reads.
         TPM =   countsPerBase * ( 1/sum[countsPerBase]) * 10^6
+        From Harold Pimentel; "What is FPKM? A review of RNA-Seq expression units"
+        https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/
 
         Args:
             gene_data: An object that works to import Ensembl based data, for use in calculations
@@ -438,7 +440,7 @@ def get_gene_mask_and_lengths(self, gene_data):
                     gene_mask.append(False)
         return gene_len_ordered, gene_mask
 
-'''Commented out for now. May reuse later, depending on approach to user-defined DGELists
+    '''Commented out for now. May reuse later, depending on approach to user-defined DGELists
     def tpm(
         self,
         gene_lengths: np.ndarray,
@@ -486,7 +488,7 @@ def tpm(
             current_log = True
 
         return self.copy(counts=counts, current_log=current_log)
-'''
+    '''
     def __repr__(self) -> str:
         """Give a pretty non-executeable representation of this object."""
         num_samples = len(self._samples) if self._samples is not None else 0
diff --git a/edgePy/data_import/user_defined.py b/edgePy/data_import/user_defined.py
deleted file mode 100644
index 7ad8ce2..0000000
--- a/edgePy/data_import/user_defined.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import numpy as np
-
-class CustomDataStore(object):
-    """
-    A data structure allowing user-defined gene data; analogous to CanonicalDataStore generated from Ensembl data.
-
-    Args:
-        genes: list of gene names
-        transcripts: list of transcript names
-        lengths: length of transcripts
-        canonical: defines whether transcript is canonical or not
-        symbols: list of gene symbols
-    """
-
-    def __init__(
-        self,
-        genes: Optional[np.array] = None,
-        transcripts: Optional[np.ndarray] = None,
-        lengths: Optional[np.ndarray] = None,
-        canonical: Optional[np.ndarray] = None,
-        symbols: Optional[np.ndarray] = None
-    ) -> None:
-
-        if genes or transcripts or lengths or symbols is None:
-            raise Exception("All DataStore components must be provided at init")
-        else:
-            if genes.size != transcripts.size:
-                raise Exception("Gene array and transcript array are not the same size")
-            if genes.size != lengths.size:
-                raise Exception("Gene array and length array are not the same size")
-            if genes.size != symbols.size:
-                raise Exception("Gene array and symbol array are not the same size")
-            if canonical is None:
-                for i in genes:
-                    canonical[i] = True
-                log.info(f"No canonical status input. All input lengths assumed to be canonical.")
-
-            self.genes = genes
-            self.transcripts = transcripts
-            self.lengths = lengths
-            self.canonical = canonical
-            self.symbols = symbols
-
-    @classmethod
-    def create_CustomDataStore(
-            cls,
-            gene_list: List[str],
-            transcript_list: List[str],
-            length_list: List[int],
-            is_canonical: List[bool],
-            symbols_list: List[str]
-    ) -> "CustomDataStore" :
-        """
-        Creates a CustomDataStore for use with DGElist operations. Alternative for data lacking Ensembl files.
-        """
-
-        log.info(f"Creating CustomDataStore object using user-defined data")
\ No newline at end of file
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index bf208bc..3bdaefd 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -213,7 +213,23 @@ def test_rpkm():
     # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000)
     assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))
 
+def test_tpm():
+    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
+    icd = CanonicalDataStore(
+        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
+    )
+    first_pos = dge_list.counts[0][0]
+    first_gene = dge_list.genes[0]
 
+    col_sum = np.sum(dge_list.counts, axis=0)
+    assert isinstance(first_pos, np.integer)
+    tpm_dge = dge_list.tpm(icd)
+    ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
+    gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
+
+    # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6
+    assert tpm_dge.counts[0][0] == (first_pos / col_sum[0]) * 1e6
+'''
 def test_tpm():
     # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
     counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
@@ -242,7 +258,7 @@ def test_tpm():
     # make sure that the sums of all genes across are the same the each sample (an important property of TPM)
     gene_sums = new_dge_list.counts.sum(axis=0)
     assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums))
-
+'''
 
 # Unit tests for ``edgePy.data_import.Importer``.\
 def test_init():

From b925fa775208a1230f96527fdf276cd22a75f7fe Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Wed, 12 Dec 2018 10:28:23 -0600
Subject: [PATCH 05/20] Address review issues

---
 edgePy/DGEList.py     | 89 ++++++++++++-------------------------------
 tests/test_DGEList.py | 37 +++---------------
 2 files changed, 31 insertions(+), 95 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index a2f84ad..f979944 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -357,48 +357,6 @@ def rpkm(
 
         return self.copy(counts=counts, current_log=current_log, genes=genes)
 
-    def tpm(
-        self,
-        gene_data: CanonicalDataStore,
-        transform_to_log: bool = False,
-        prior_count: float = PRIOR_COUNT,
-    ) -> "DGEList":
-        """Return the DGEList normalized to transcripts per million reads.
-        TPM =   countsPerBase * ( 1/sum[countsPerBase]) * 10^6
-        From Harold Pimentel; "What is FPKM? A review of RNA-Seq expression units"
-        https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/
-
-        Args:
-            gene_data: An object that works to import Ensembl based data, for use in calculations
-            transform_to_log: true, if you wish to convert to log after converting to RPKM
-            prior_count: a minimum value for genes, if you do log transforms.
-        """
-        current_log = self.current_log_status
-
-        #checks current log status and converts if currently in log format
-        if self.current_log_status:
-            self.counts = np.exp(self.counts)
-            current_log = False
-
-        #returns canonical gene length and gene mask
-        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
-
-        genes = self.genes[gene_mask].copy()
-        counts = self.counts[gene_mask].copy()
-
-        #calculates counts per base (ie, the rate)
-        rate = (counts.T / gene_len_ordered).T
-
-        #calculates tpm
-        counts = (rate / np.sum(rate)) * 1e6
-
-        if transform_to_log:
-            counts = self.log_transform(counts, prior_count)
-            current_log = True
-
-        return self.copy(counts=counts, current_log=current_log, genes=genes)
-
-
     def get_gene_mask_and_lengths(self, gene_data):
 
         """
@@ -440,18 +398,18 @@ def get_gene_mask_and_lengths(self, gene_data):
                     gene_mask.append(False)
         return gene_len_ordered, gene_mask
 
-    '''Commented out for now. May reuse later, depending on approach to user-defined DGELists
     def tpm(
         self,
-        gene_lengths: np.ndarray,
+        gene_data: CanonicalDataStore,
         transform_to_log: bool = False,
         prior_count: float = PRIOR_COUNT,
-        mean_fragment_lengths: np.ndarray = None,
     ) -> "DGEList":
         """Normalize the DGEList to transcripts per million.
 
-        Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data:
-        RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3
+        Adapted from Li et al. "RNA-Seq gene expression estimation with read mapping uncertainty."
+        Bioinformatics, Volume 26, Issue 4, 15 February 2010, Pages 493–500,
+        https://doi.org/10.1093/bioinformatics/btp692
+
 
         Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` )
         are normalized as follows:
@@ -463,32 +421,35 @@ def tpm(
 
         Args:
             gene_data: An object that works to import Ensembl based data, for use in calculations
-            transform_to_log: store log outputs
-            prior_count: a minimum value for genes, if you do log transforms
-            mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts`
-                (optional)
+            transform_to_log: true, if you wish to convert to log after converting to RPKM
+            prior_count: a minimum value for genes, if you do log transforms.
 
         """
+        current_log = self.current_log_status
 
-        # compute effective length not allowing negative lengths
-        if mean_fragment_lengths:
-            effective_lengths = (
-                gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
-            ).clip(min=1)
-        else:
-            effective_lengths = gene_lengths[:, np.newaxis]
+        # checks current log status and converts if currently in log format
+        if self.current_log_status:
+            self.counts = np.exp(self.counts)
+            current_log = False
 
-        # how many counts per base
-        base_counts = self.counts / effective_lengths
+        # returns canonical gene length and gene mask
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+
+        genes = self.genes[gene_mask].copy()
+        counts = self.counts[gene_mask].copy()
+
+        # calculates counts per base (ie, the rate)
+        rate = (counts.T / gene_len_ordered).T
+
+        # calculates tpm
+        counts = (rate / np.sum(rate)) * 1e6
 
-        counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :]
-        current_log = self.current_log_status
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)
             current_log = True
 
-        return self.copy(counts=counts, current_log=current_log)
-    '''
+        return self.copy(counts=counts, current_log=current_log, genes=genes)
+
     def __repr__(self) -> str:
         """Give a pretty non-executeable representation of this object."""
         num_samples = len(self._samples) if self._samples is not None else 0
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index 3bdaefd..f30b61b 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -221,44 +221,19 @@ def test_tpm():
     first_pos = dge_list.counts[0][0]
     first_gene = dge_list.genes[0]
 
-    col_sum = np.sum(dge_list.counts, axis=0)
+    gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd)
+
+    rate = (dge_list.counts.T / gene_len_ordered).T
+    rate_sum = np.sum(rate)
+
     assert isinstance(first_pos, np.integer)
     tpm_dge = dge_list.tpm(icd)
     ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
     gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
 
     # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6
-    assert tpm_dge.counts[0][0] == (first_pos / col_sum[0]) * 1e6
-'''
-def test_tpm():
-    # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
-    counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
-    gene_lengths = np.array([2000, 4000, 1000, 10000])
-
-    expected = np.array(
-        [
-            [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26],
-            [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26],
-            [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26],
-            [0.0, 0.0, 2217.294_900_22],
-        ]
-    )
-
-    dge_list = DGEList(
-        counts=counts,
-        samples=np.array(['a', 'b', 'c']),
-        genes=np.array(['a', 'b', 'c', 'd']),
-        groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']},
-    )
-    assert isinstance(dge_list.counts[0][0], np.integer)
-    new_dge_list = dge_list.tpm(gene_lengths)
-
-    assert np.allclose(new_dge_list.counts, expected, atol=1e-1)
+    assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6
 
-    # make sure that the sums of all genes across are the same the each sample (an important property of TPM)
-    gene_sums = new_dge_list.counts.sum(axis=0)
-    assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums))
-'''
 
 # Unit tests for ``edgePy.data_import.Importer``.\
 def test_init():

From 7008d8b1bfe7dc02c6e8d3fc2783c07aea26d7e0 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Wed, 12 Dec 2018 11:14:30 -0600
Subject: [PATCH 06/20] Created new rate fn for TPM and test

---
 edgePy/DGEList.py     | 33 +++++++++++++++++++++++----------
 tests/test_DGEList.py |  5 +++--
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index f979944..1888d88 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -421,7 +421,7 @@ def tpm(
 
         Args:
             gene_data: An object that works to import Ensembl based data, for use in calculations
-            transform_to_log: true, if you wish to convert to log after converting to RPKM
+            transform_to_log: true, if you wish to convert to log after converting to TPM
             prior_count: a minimum value for genes, if you do log transforms.
 
         """
@@ -432,17 +432,11 @@ def tpm(
             self.counts = np.exp(self.counts)
             current_log = False
 
-        # returns canonical gene length and gene mask
-        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
-
-        genes = self.genes[gene_mask].copy()
-        counts = self.counts[gene_mask].copy()
-
-        # calculates counts per base (ie, the rate)
-        rate = (counts.T / gene_len_ordered).T
+        rates = self.get_rates(gene_data)
+        rate_sum = np.sum(rates)
 
         # calculates tpm
-        counts = (rate / np.sum(rate)) * 1e6
+        counts = (rates / rate_sum) * 1e6
 
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)
@@ -450,6 +444,25 @@ def tpm(
 
         return self.copy(counts=counts, current_log=current_log, genes=genes)
 
+    def get_rates(
+        self,
+        gene_data: CanonicalDataStore
+        ) -> "DGEList":
+        """Gets the number of counts per base, otherwise known as the rate, for all genes.
+        Currently used for TPM calculations.
+
+        Args:
+            gene_data: An object that works to import Ensembl based data, for use in calculations
+        """
+
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+        counts = self.counts[gene_mask].copy()
+
+        # calculates counts per base (ie, the rate)
+        rates = (counts.T / gene_len_ordered).T
+
+        return rates
+
     def __repr__(self) -> str:
         """Give a pretty non-executeable representation of this object."""
         num_samples = len(self._samples) if self._samples is not None else 0
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index f30b61b..4078d24 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -223,8 +223,9 @@ def test_tpm():
 
     gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd)
 
-    rate = (dge_list.counts.T / gene_len_ordered).T
-    rate_sum = np.sum(rate)
+    rates = dge_list.get_rates(icd)
+
+    rate_sum = np.sum(rates)
 
     assert isinstance(first_pos, np.integer)
     tpm_dge = dge_list.tpm(icd)

From 55c7e2655dfae0d23ff02d6efac2a4a6c3c7032b Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Wed, 12 Dec 2018 11:24:49 -0600
Subject: [PATCH 07/20] Minor cleanup

---
 edgePy/DGEList.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 1888d88..7407376 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -432,6 +432,7 @@ def tpm(
             self.counts = np.exp(self.counts)
             current_log = False
 
+        genes = self.genes[gene_mask].copy()
         rates = self.get_rates(gene_data)
         rate_sum = np.sum(rates)
 

From 91216648fa3c54b052a020d9752d8ea9cb69a58f Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Wed, 12 Dec 2018 11:29:17 -0600
Subject: [PATCH 08/20] Fix minor errors

---
 edgePy/DGEList.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 7407376..ef98607 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -432,7 +432,10 @@ def tpm(
             self.counts = np.exp(self.counts)
             current_log = False
 
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+
         genes = self.genes[gene_mask].copy()
+        
         rates = self.get_rates(gene_data)
         rate_sum = np.sum(rates)
 

From eba0419df7e60620c8cc8a841550196bf68ea51b Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Wed, 12 Dec 2018 11:57:52 -0600
Subject: [PATCH 09/20] Fix PR issues, add fn to get counts per base

---
 edgePy/DGEList.py     | 2 +-
 tests/test_DGEList.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index ef98607..bd2c072 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -435,7 +435,7 @@ def tpm(
         gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
 
         genes = self.genes[gene_mask].copy()
-        
+
         rates = self.get_rates(gene_data)
         rate_sum = np.sum(rates)
 
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index 4078d24..a581b11 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -221,8 +221,6 @@ def test_tpm():
     first_pos = dge_list.counts[0][0]
     first_gene = dge_list.genes[0]
 
-    gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd)
-
     rates = dge_list.get_rates(icd)
 
     rate_sum = np.sum(rates)
@@ -232,7 +230,7 @@ def test_tpm():
     ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
     gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
 
-    # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6
+    # TPM = countsPerBase / sum[countsPerBase]) * 10^6
     assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6
 
 

From 4f88681c78e529aa4e8fd809e39efc9a23a91037 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 20:53:28 -0600
Subject: [PATCH 10/20] Add kb->b conversion

---
 edgePy/DGEList.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index bd2c072..1c6aacd 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -360,7 +360,7 @@ def rpkm(
     def get_gene_mask_and_lengths(self, gene_data):
 
         """
-        use gene_data to get the gene lenths and a gene mask for the tranformation.
+        use gene_data to get the gene lengths and a gene mask for the tranformation.
         Args:
             gene_data: the object that holds gene data from ensembl
 
@@ -462,6 +462,8 @@ def get_rates(
         gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
         counts = self.counts[gene_mask].copy()
 
+        gene_len_ordered = gene_len_ordered * 1e3
+
         # calculates counts per base (ie, the rate)
         rates = (counts.T / gene_len_ordered).T
 

From 066f915c0629bf3c1ee25d54510a11a7efe4ca19 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 20:59:53 -0600
Subject: [PATCH 11/20] Adjust TPM for length conversion

---
 edgePy/DGEList.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 1c6aacd..95619a3 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -439,8 +439,8 @@ def tpm(
         rates = self.get_rates(gene_data)
         rate_sum = np.sum(rates)
 
-        # calculates tpm
-        counts = (rates / rate_sum) * 1e6
+        # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths
+        counts = (rates / rate_sum) * 1e9
 
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)
@@ -462,7 +462,6 @@ def get_rates(
         gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
         counts = self.counts[gene_mask].copy()
 
-        gene_len_ordered = gene_len_ordered * 1e3
 
         # calculates counts per base (ie, the rate)
         rates = (counts.T / gene_len_ordered).T

From e1acb5f1cba07ee5965852478dfcd95a36d286c6 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:04:50 -0600
Subject: [PATCH 12/20] Testing kb conversion

---
 edgePy/DGEList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 95619a3..148bf13 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -440,7 +440,7 @@ def tpm(
         rate_sum = np.sum(rates)
 
         # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths
-        counts = (rates / rate_sum) * 1e9
+        counts = (rates / rate_sum) * 1e3
 
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)

From 11653f95319e711a8abd8cef7cbaef381d67e2fa Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:20:34 -0600
Subject: [PATCH 13/20] Add get_rates test v1

---
 tests/test_DGEList.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index a581b11..779ad53 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -213,6 +213,22 @@ def test_rpkm():
     # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000)
     assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))
 
+def test_get_rates():
+    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
+    icd = CanonicalDataStore(
+        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
+    )
+    first_pos = dge_list.counts[0][0]
+    first_gene = dge_list.genes[0]
+
+    assert isinstance(first_pos, np.integer)
+    ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
+    gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
+
+    rates = dge_list.get_rates(icd)
+
+    assert rates[0] == first_pos / gene_len
+
 def test_tpm():
     dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
     icd = CanonicalDataStore(

From 166ab5c79e82069e32649e7d88e5fc142b9493d7 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:24:14 -0600
Subject: [PATCH 14/20] Test get_rates v2

---
 tests/test_DGEList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index 779ad53..4ac3f67 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -227,7 +227,7 @@ def test_get_rates():
 
     rates = dge_list.get_rates(icd)
 
-    assert rates[0] == first_pos / gene_len
+    assert rates[0][0] == first_pos / gene_len
 
 def test_tpm():
     dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))

From d94bdbcf20079251d5ea3a957e6064557dd21a28 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:32:46 -0600
Subject: [PATCH 15/20] Test get_rates v3

---
 edgePy/DGEList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 148bf13..a7a9e8e 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -464,7 +464,7 @@ def get_rates(
 
 
         # calculates counts per base (ie, the rate)
-        rates = (counts.T / gene_len_ordered).T
+        rates = (counts.T / (gene_len_ordered * 1e3)).T
 
         return rates
 

From 36af95eafd0b26f1db33c536b73d171ed81764f2 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:38:42 -0600
Subject: [PATCH 16/20] Add get rates test v4

---
 edgePy/DGEList.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index a7a9e8e..0914d63 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -464,8 +464,10 @@ def get_rates(
 
 
         # calculates counts per base (ie, the rate)
-        rates = (counts.T / (gene_len_ordered * 1e3)).T
+        rates = (counts.T / gene_len_ordered).T
 
+        rates = rates * 1e3
+        
         return rates
 
     def __repr__(self) -> str:

From 77418a639aea047ba37431694ea94efda8a4d11f Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:45:08 -0600
Subject: [PATCH 17/20] Add get rates test v5

---
 edgePy/DGEList.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 0914d63..fda5a38 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -466,8 +466,8 @@ def get_rates(
         # calculates counts per base (ie, the rate)
         rates = (counts.T / gene_len_ordered).T
 
-        rates = rates * 1e3
-        
+        rates = rates / 1e3
+
         return rates
 
     def __repr__(self) -> str:

From 7da1f8f37d40d7d5891c8528ab23e0ceecd74dd6 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Sat, 15 Dec 2018 21:52:32 -0600
Subject: [PATCH 18/20] Add get rate test v6

---
 edgePy/DGEList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index fda5a38..4c3d6c4 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -440,7 +440,7 @@ def tpm(
         rate_sum = np.sum(rates)
 
         # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths
-        counts = (rates / rate_sum) * 1e3
+        counts = (rates / rate_sum) * 1e6
 
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)

From 8295ac97d0f2ed1f1d1d6a54bc65b848913cc975 Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Mon, 17 Dec 2018 10:44:18 -0600
Subject: [PATCH 19/20] Reformat

---
 edgePy/DGEList.py     | 83 +++++++++++++++++++++----------------------
 tests/test_DGEList.py |  7 ++--
 2 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index 4c3d6c4..aba1a1e 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -52,17 +52,17 @@ class DGEList(object):
     )
 
     def __init__(
-        self,
-        counts: Optional[np.ndarray] = None,
-        samples: Optional[np.array] = None,
-        genes: Optional[np.array] = None,
-        norm_factors: Optional[np.array] = None,
-        groups_in_list: Optional[np.array] = None,
-        groups_in_dict: Optional[Dict] = None,
-        to_remove_zeroes: Optional[bool] = False,
-        filename: Optional[str] = None,
-        current_transform_type: Optional[str] = None,
-        current_log_status: Optional[bool] = False,
+            self,
+            counts: Optional[np.ndarray] = None,
+            samples: Optional[np.array] = None,
+            genes: Optional[np.array] = None,
+            norm_factors: Optional[np.array] = None,
+            groups_in_list: Optional[np.array] = None,
+            groups_in_dict: Optional[Dict] = None,
+            to_remove_zeroes: Optional[bool] = False,
+            filename: Optional[str] = None,
+            current_transform_type: Optional[str] = None,
+            current_log_status: Optional[bool] = False,
     ) -> None:
 
         self.to_remove_zeroes = to_remove_zeroes
@@ -108,16 +108,16 @@ def __init__(
                 )
 
     def copy(
-        self,
-        counts: Optional[np.ndarray] = None,
-        samples: Optional[np.array] = None,
-        genes: Optional[np.array] = None,
-        norm_factors: Optional[np.array] = None,
-        groups_in_list: Optional[np.array] = None,
-        groups_in_dict: Optional[Dict] = None,
-        to_remove_zeroes: Optional[bool] = False,
-        current_type: Optional[str] = None,
-        current_log: Optional[bool] = False,
+            self,
+            counts: Optional[np.ndarray] = None,
+            samples: Optional[np.array] = None,
+            genes: Optional[np.array] = None,
+            norm_factors: Optional[np.array] = None,
+            groups_in_list: Optional[np.array] = None,
+            groups_in_dict: Optional[Dict] = None,
+            to_remove_zeroes: Optional[bool] = False,
+            current_type: Optional[str] = None,
+            current_log: Optional[bool] = False,
     ) -> "DGEList":
 
         return DGEList(
@@ -323,10 +323,10 @@ def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT)
         return self.copy(counts=counts, current_log=current_log)
 
     def rpkm(
-        self,
-        gene_data: CanonicalDataStore,
-        transform_to_log: bool = False,
-        prior_count: float = PRIOR_COUNT,
+            self,
+            gene_data: CanonicalDataStore,
+            transform_to_log: bool = False,
+            prior_count: float = PRIOR_COUNT,
     ) -> "DGEList":
         """Return the DGEList normalized to reads per kilobase of gene length
         per million reads. (RPKM =   numReads / ( geneLength/1000 * totalNumReads/1,000,000 )
@@ -399,10 +399,10 @@ def get_gene_mask_and_lengths(self, gene_data):
         return gene_len_ordered, gene_mask
 
     def tpm(
-        self,
-        gene_data: CanonicalDataStore,
-        transform_to_log: bool = False,
-        prior_count: float = PRIOR_COUNT,
+            self,
+            gene_data: CanonicalDataStore,
+            transform_to_log: bool = False,
+            prior_count: float = PRIOR_COUNT,
     ) -> "DGEList":
         """Normalize the DGEList to transcripts per million.
 
@@ -449,9 +449,9 @@ def tpm(
         return self.copy(counts=counts, current_log=current_log, genes=genes)
 
     def get_rates(
-        self,
-        gene_data: CanonicalDataStore
-        ) -> "DGEList":
+            self,
+            gene_data: CanonicalDataStore
+    ) -> "DGEList":
         """Gets the number of counts per base, otherwise known as the rate, for all genes.
         Currently used for TPM calculations.
 
@@ -462,7 +462,6 @@ def get_rates(
         gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
         counts = self.counts[gene_mask].copy()
 
-
         # calculates counts per base (ie, the rate)
         rates = (counts.T / gene_len_ordered).T
 
@@ -518,12 +517,12 @@ def read_npz_file(self, filename: str) -> None:
 
     @classmethod
     def create_DGEList(
-        cls,
-        sample_list: List[str],
-        data_set: Dict[Hashable, Any],  # {sample: {gene1: x, gene2: y}},
-        gene_list: List[str],
-        sample_to_category: Optional[List[str]] = None,
-        category_to_samples: Optional[Dict[Hashable, List[str]]] = None,
+            cls,
+            sample_list: List[str],
+            data_set: Dict[Hashable, Any],  # {sample: {gene1: x, gene2: y}},
+            gene_list: List[str],
+            sample_to_category: Optional[List[str]] = None,
+            category_to_samples: Optional[Dict[Hashable, List[str]]] = None,
     ) -> "DGEList":
         """ sample list and gene list must be pre-sorted
             Use this to create the DGE object for future work."""
@@ -548,7 +547,7 @@ def create_DGEList(
 
     @classmethod
     def create_DGEList_data_file(
-        cls, data_file: Path, group_file: Path, **kwargs: Mapping
+            cls, data_file: Path, group_file: Path, **kwargs: Mapping
     ) -> "DGEList":
         """Wrapper for creating DGEList objects from file locations.  Performs open and passes
         the file handles to the method for creating a DGEList object.
@@ -567,13 +566,13 @@ def create_DGEList_data_file(
 
         """
         with smart_open(data_file, 'r') as data_handle, smart_open(
-            group_file, 'r'
+                group_file, 'r'
         ) as group_handle:
             return cls.create_DGEList_handle(data_handle, group_handle, **kwargs)
 
     @classmethod
     def create_DGEList_handle(
-        cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping
+            cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping
     ) -> "DGEList":
         """Read in a file-like object of delimited data for instantiation.
 
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index 4ac3f67..c54e53b 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -18,7 +18,7 @@
 @pytest.fixture
 def dge_list():
     with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open(
-        get_dataset_path(TEST_GROUPS), 'r'
+            get_dataset_path(TEST_GROUPS), 'r'
     ) as group_handle:
         return DGEList.create_DGEList_handle(data_handle, group_handle)
 
@@ -40,7 +40,6 @@ def test_sample_group_list():
 
 
 def test_minimal_init():
-
     dge_list = DGEList(
         to_remove_zeroes=False,
         counts=np.ones(shape=(5, 5)),
@@ -92,7 +91,6 @@ def test_library_size():
 
 
 def test_setting_DGElist_counts():
-
     dge_list = DGEList(
         counts=np.zeros(shape=(5, 10)),
         groups_in_list=['A', 'A', 'B', 'B', 'B'],
@@ -116,7 +114,6 @@ def test_setting_DGElist_counts():
 
 
 def test_cycle_dge_npz():
-
     import tempfile
     import os
 
@@ -213,6 +210,7 @@ def test_rpkm():
     # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000)
     assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))
 
+
 def test_get_rates():
     dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
     icd = CanonicalDataStore(
@@ -229,6 +227,7 @@ def test_get_rates():
 
     assert rates[0][0] == first_pos / gene_len
 
+
 def test_tpm():
     dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
     icd = CanonicalDataStore(

From 33699300b6e0ebc73d6ef4790f87a7072f7c2b8e Mon Sep 17 00:00:00 2001
From: zbohannan <zbohannan@gmail.com>
Date: Mon, 17 Dec 2018 11:44:02 -0600
Subject: [PATCH 20/20] Reformat

---
 docs/source/conf.py                  |  1 -
 edgePy/DGEList.py                    | 81 ++++++++++++++--------------
 tests/mongodb/test_gene_functions.py | 41 ++++++++------
 tests/test_DGEList.py                |  2 +-
 4 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6c8991f..bc83afd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,4 +1,3 @@
-
 # -*- coding: utf-8 -*-
 #
 # Configuration file for the Sphinx documentation builder.
diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
index aba1a1e..614a3d8 100644
--- a/edgePy/DGEList.py
+++ b/edgePy/DGEList.py
@@ -52,17 +52,17 @@ class DGEList(object):
     )
 
     def __init__(
-            self,
-            counts: Optional[np.ndarray] = None,
-            samples: Optional[np.array] = None,
-            genes: Optional[np.array] = None,
-            norm_factors: Optional[np.array] = None,
-            groups_in_list: Optional[np.array] = None,
-            groups_in_dict: Optional[Dict] = None,
-            to_remove_zeroes: Optional[bool] = False,
-            filename: Optional[str] = None,
-            current_transform_type: Optional[str] = None,
-            current_log_status: Optional[bool] = False,
+        self,
+        counts: Optional[np.ndarray] = None,
+        samples: Optional[np.array] = None,
+        genes: Optional[np.array] = None,
+        norm_factors: Optional[np.array] = None,
+        groups_in_list: Optional[np.array] = None,
+        groups_in_dict: Optional[Dict] = None,
+        to_remove_zeroes: Optional[bool] = False,
+        filename: Optional[str] = None,
+        current_transform_type: Optional[str] = None,
+        current_log_status: Optional[bool] = False,
     ) -> None:
 
         self.to_remove_zeroes = to_remove_zeroes
@@ -108,16 +108,16 @@ def __init__(
                 )
 
     def copy(
-            self,
-            counts: Optional[np.ndarray] = None,
-            samples: Optional[np.array] = None,
-            genes: Optional[np.array] = None,
-            norm_factors: Optional[np.array] = None,
-            groups_in_list: Optional[np.array] = None,
-            groups_in_dict: Optional[Dict] = None,
-            to_remove_zeroes: Optional[bool] = False,
-            current_type: Optional[str] = None,
-            current_log: Optional[bool] = False,
+        self,
+        counts: Optional[np.ndarray] = None,
+        samples: Optional[np.array] = None,
+        genes: Optional[np.array] = None,
+        norm_factors: Optional[np.array] = None,
+        groups_in_list: Optional[np.array] = None,
+        groups_in_dict: Optional[Dict] = None,
+        to_remove_zeroes: Optional[bool] = False,
+        current_type: Optional[str] = None,
+        current_log: Optional[bool] = False,
     ) -> "DGEList":
 
         return DGEList(
@@ -323,10 +323,10 @@ def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT)
         return self.copy(counts=counts, current_log=current_log)
 
     def rpkm(
-            self,
-            gene_data: CanonicalDataStore,
-            transform_to_log: bool = False,
-            prior_count: float = PRIOR_COUNT,
+        self,
+        gene_data: CanonicalDataStore,
+        transform_to_log: bool = False,
+        prior_count: float = PRIOR_COUNT,
     ) -> "DGEList":
         """Return the DGEList normalized to reads per kilobase of gene length
         per million reads. (RPKM =   numReads / ( geneLength/1000 * totalNumReads/1,000,000 )
@@ -399,10 +399,10 @@ def get_gene_mask_and_lengths(self, gene_data):
         return gene_len_ordered, gene_mask
 
     def tpm(
-            self,
-            gene_data: CanonicalDataStore,
-            transform_to_log: bool = False,
-            prior_count: float = PRIOR_COUNT,
+        self,
+        gene_data: CanonicalDataStore,
+        transform_to_log: bool = False,
+        prior_count: float = PRIOR_COUNT,
     ) -> "DGEList":
         """Normalize the DGEList to transcripts per million.
 
@@ -448,10 +448,7 @@ def tpm(
 
         return self.copy(counts=counts, current_log=current_log, genes=genes)
 
-    def get_rates(
-            self,
-            gene_data: CanonicalDataStore
-    ) -> "DGEList":
+    def get_rates(self, gene_data: CanonicalDataStore) -> "DGEList":
         """Gets the number of counts per base, otherwise known as the rate, for all genes.
         Currently used for TPM calculations.
 
@@ -517,12 +514,12 @@ def read_npz_file(self, filename: str) -> None:
 
     @classmethod
     def create_DGEList(
-            cls,
-            sample_list: List[str],
-            data_set: Dict[Hashable, Any],  # {sample: {gene1: x, gene2: y}},
-            gene_list: List[str],
-            sample_to_category: Optional[List[str]] = None,
-            category_to_samples: Optional[Dict[Hashable, List[str]]] = None,
+        cls,
+        sample_list: List[str],
+        data_set: Dict[Hashable, Any],  # {sample: {gene1: x, gene2: y}},
+        gene_list: List[str],
+        sample_to_category: Optional[List[str]] = None,
+        category_to_samples: Optional[Dict[Hashable, List[str]]] = None,
     ) -> "DGEList":
         """ sample list and gene list must be pre-sorted
             Use this to create the DGE object for future work."""
@@ -547,7 +544,7 @@ def create_DGEList(
 
     @classmethod
     def create_DGEList_data_file(
-            cls, data_file: Path, group_file: Path, **kwargs: Mapping
+        cls, data_file: Path, group_file: Path, **kwargs: Mapping
     ) -> "DGEList":
         """Wrapper for creating DGEList objects from file locations.  Performs open and passes
         the file handles to the method for creating a DGEList object.
@@ -566,13 +563,13 @@ def create_DGEList_data_file(
 
         """
         with smart_open(data_file, 'r') as data_handle, smart_open(
-                group_file, 'r'
+            group_file, 'r'
         ) as group_handle:
             return cls.create_DGEList_handle(data_handle, group_handle, **kwargs)
 
     @classmethod
     def create_DGEList_handle(
-            cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping
+        cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping
     ) -> "DGEList":
         """Read in a file-like object of delimited data for instantiation.
 
diff --git a/tests/mongodb/test_gene_functions.py b/tests/mongodb/test_gene_functions.py
index 9a41aa2..a667a22 100644
--- a/tests/mongodb/test_gene_functions.py
+++ b/tests/mongodb/test_gene_functions.py
@@ -16,28 +16,34 @@
             "size": 720,
             "canonical": "0",
             "exons": {
-                "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
+                "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
             },
-            "rpkm": 0.39507510837872767,
+            "rpkm": 0.395_075_108_378_727_67,
         },
         "ENST00000576696": {
             "size": 1306,
             "canonical": "0",
             "exons": {
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
-                "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
+                "ENSE00002672617": {
+                    "raw": 5.564_356_435_643_564,
+                    "rpkm": 0.160_022_655_238_508_75,
+                },
             },
-            "rpkm": 0.195204453741728,
+            "rpkm": 0.195_204_453_741_728,
         },
         "ENST00000443778": {
             "size": 2084,
             "canonical": "1",
             "exons": {
-                "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511},
-                "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 0.22669418591124607},
+                "ENSE00001729822": {"raw": 2, "rpkm": 0.099_786_557_974_451_1},
+                "ENSE00001608298": {
+                    "raw": 1.177_717_771_777_177_6,
+                    "rpkm": 0.226_694_185_911_246_07,
+                },
             },
-            "rpkm": 0.05165702955135873,
+            "rpkm": 0.051_657_029_551_358_73,
         },
     },
     "star_rpkm": None,
@@ -53,19 +59,22 @@
             "size": 720,
             "canonical": "0",
             "exons": {
-                "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
+                "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
             },
-            "rpkm": 0.39507510837872767,
+            "rpkm": 0.395_075_108_378_727_67,
         },
         "ENST00000576696": {
             "size": 1306,
             "canonical": "0",
             "exons": {
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
-                "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
+                "ENSE00002672617": {
+                    "raw": 5.564_356_435_643_564,
+                    "rpkm": 0.160_022_655_238_508_75,
+                },
             },
-            "rpkm": 0.195204453741728,
+            "rpkm": 0.195_204_453_741_728,
         },
     },
     "star_rpkm": None,
@@ -137,7 +146,7 @@ def test_get_sample_details(mongodb):
 
 def test_get_canonical_rpkm():
     rpkm = get_canonical_rpkm(RNASeq_RECORD)
-    assert rpkm == 0.05165702955135873
+    assert rpkm == 0.051_657_029_551_358_73
 
 
 def test_get_canonical_rpkm_no_canonical():
diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
index c54e53b..bb3e00a 100644
--- a/tests/test_DGEList.py
+++ b/tests/test_DGEList.py
@@ -18,7 +18,7 @@
 @pytest.fixture
 def dge_list():
     with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open(
-            get_dataset_path(TEST_GROUPS), 'r'
+        get_dataset_path(TEST_GROUPS), 'r'
     ) as group_handle:
         return DGEList.create_DGEList_handle(data_handle, group_handle)