From 945c57177794c2c947517c49f8e1883a0b898ac0 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Tue, 20 Nov 2018 11:14:34 -0600 Subject: [PATCH 01/20] Change fn sigs --- edgePy/DGEList.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index b345d14..1fa3138 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -400,7 +400,7 @@ def get_gene_mask_and_lengths(self, gene_data): def tpm( self, - gene_lengths: np.ndarray, + gene_data: CanonicalDataStore, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT, mean_fragment_lengths: np.ndarray = None, @@ -419,9 +419,9 @@ def tpm( \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6 Args: - gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`. + gene_data: An object that works to import Ensembl based data, for use in calculations transform_to_log: store log outputs - prior_count: + prior_count: a minimum value for genes, if you do log transforms mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts` (optional) @@ -430,10 +430,10 @@ def tpm( # compute effective length not allowing negative lengths if mean_fragment_lengths: effective_lengths = ( - gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] + gene_data[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] ).clip(min=1) else: - effective_lengths = gene_lengths[:, np.newaxis] + effective_lengths = gene_data[:, np.newaxis] # how many counts per base base_counts = self.counts / effective_lengths From a2fdfb086f4a615cb778dd5b217402e9dec120ff Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 24 Nov 2018 14:07:01 -0600 Subject: [PATCH 02/20] Harmonize TPM RPKM --- edgePy/DGEList.py | 49 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 1fa3138..f708cdb 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -357,6 +357,46 @@ def rpkm( return self.copy(counts=counts, current_log=current_log, genes=genes) + def tpm( + self, + gene_data: CanonicalDataStore, + transform_to_log: bool = False, + prior_count: float = PRIOR_COUNT, + ) -> "DGEList": + """Return the DGEList normalized to transcripts per million reads. + TPM = countsPerBase * ( 1/sum[countsPerBase]) * 10^6 + + Args: + gene_data: An object that works to import Ensembl based data, for use in calculations + transform_to_log: true, if you wish to convert to log after converting to RPKM + prior_count: a minimum value for genes, if you do log transforms. + """ + current_log = self.current_log_status + + #checks current log status and converts if currently in log format + if self.current_log_status: + self.counts = np.exp(self.counts) + current_log = False + + #returns canonical gene length and gene mask + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + + genes = self.genes[gene_mask].copy() + counts = self.counts[gene_mask].copy() + + #calculates counts per base (ie, the rate) + rate = (counts.T / gene_len_ordered).T + + #calculates tpm + counts = (rate / np.sum(rate)) * 1e6 + + if transform_to_log: + counts = self.log_transform(counts, prior_count) + current_log = True + + return self.copy(counts=counts, current_log=current_log, genes=genes) + + def get_gene_mask_and_lengths(self, gene_data): """ @@ -398,9 +438,10 @@ def get_gene_mask_and_lengths(self, gene_data): gene_mask.append(False) return gene_len_ordered, gene_mask +'''Commented out for now. May reuse later, depending on approach to user-defined DGELists def tpm( self, - gene_data: CanonicalDataStore, + gene_lengths: np.ndarray, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT, mean_fragment_lengths: np.ndarray = None, @@ -430,10 +471,10 @@ def tpm( # compute effective length not allowing negative lengths if mean_fragment_lengths: effective_lengths = ( - gene_data[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] + gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] ).clip(min=1) else: - effective_lengths = gene_data[:, np.newaxis] + effective_lengths = gene_lengths[:, np.newaxis] # how many counts per base base_counts = self.counts / effective_lengths @@ -445,7 +486,7 @@ def tpm( current_log = True return self.copy(counts=counts, current_log=current_log) - +''' def __repr__(self) -> str: """Give a pretty non-executeable representation of this object.""" num_samples = len(self._samples) if self._samples is not None else 0 From 53bff6919b67862599614022f8be9d5e69647ee3 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 1 Dec 2018 14:53:57 -0600 Subject: [PATCH 03/20] Start work on customdatastore --- edgePy/data_import/user_defined.py | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 edgePy/data_import/user_defined.py diff --git a/edgePy/data_import/user_defined.py b/edgePy/data_import/user_defined.py new file mode 100644 index 0000000..7ad8ce2 --- /dev/null +++ b/edgePy/data_import/user_defined.py @@ -0,0 +1,57 @@ +import numpy as np + +class CustomDataStore(object): + """ + A data structure allowing user-defined gene data; analogous to CanonicalDataStore generated from Ensembl data. + + Args: + genes: list of gene names + transcripts: list of transcript names + lengths: length of transcripts + canonical: defines whether transcript is canonical or not + symbols: list of gene symbols + """ + + def __init__( + self, + genes: Optional[np.array] = None, + transcripts: Optional[np.ndarray] = None, + lengths: Optional[np.ndarray] = None, + canonical: Optional[np.ndarray] = None, + symbols: Optional[np.ndarray] = None + ) -> None: + + if genes or transcripts or lengths or symbols is None: + raise Exception("All DataStore components must be provided at init") + else: + if genes.size != transcripts.size: + raise Exception("Gene array and transcript array are not the same size") + if genes.size != lengths.size: + raise Exception("Gene array and length array are not the same size") + if genes.size != symbols.size: + raise Exception("Gene array and symbol array are not the same size") + if canonical is None: + for i in genes: + canonical[i] = True + log.info(f"No canonical status input. All input lengths assumed to be canonical.") + + self.genes = genes + self.transcripts = transcripts + self.lengths = lengths + self.canonical = canonical + self.symbols = symbols + + @classmethod + def create_CustomDataStore( + cls, + gene_list: List[str], + transcript_list: List[str], + length_list: List[int], + is_canonical: List[bool], + symbols_list: List[str] + ) -> "CustomDataStore" : + """ + Creates a CustomDataStore for use with DGElist operations. Alternative for data lacking Ensembl files. + """ + + log.info(f"Creating CustomDataStore object using user-defined data") \ No newline at end of file From 17217234b0270dd046b42737e62e53709eab9d72 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Tue, 11 Dec 2018 14:02:51 -0600 Subject: [PATCH 04/20] TPM fn sig harmonized to RPKM --- edgePy/DGEList.py | 6 ++-- edgePy/data_import/user_defined.py | 57 ------------------------------ tests/test_DGEList.py | 18 +++++++++- 3 files changed, 21 insertions(+), 60 deletions(-) delete mode 100644 edgePy/data_import/user_defined.py diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index f708cdb..a2f84ad 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -365,6 +365,8 @@ def tpm( ) -> "DGEList": """Return the DGEList normalized to transcripts per million reads. TPM = countsPerBase * ( 1/sum[countsPerBase]) * 10^6 + From Harold Pimentel; "What is FPKM? A review of RNA-Seq expression units" + https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ Args: gene_data: An object that works to import Ensembl based data, for use in calculations @@ -438,7 +440,7 @@ def get_gene_mask_and_lengths(self, gene_data): gene_mask.append(False) return gene_len_ordered, gene_mask -'''Commented out for now. May reuse later, depending on approach to user-defined DGELists + '''Commented out for now. May reuse later, depending on approach to user-defined DGELists def tpm( self, gene_lengths: np.ndarray, @@ -486,7 +488,7 @@ def tpm( current_log = True return self.copy(counts=counts, current_log=current_log) -''' + ''' def __repr__(self) -> str: """Give a pretty non-executeable representation of this object.""" num_samples = len(self._samples) if self._samples is not None else 0 diff --git a/edgePy/data_import/user_defined.py b/edgePy/data_import/user_defined.py deleted file mode 100644 index 7ad8ce2..0000000 --- a/edgePy/data_import/user_defined.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np - -class CustomDataStore(object): - """ - A data structure allowing user-defined gene data; analogous to CanonicalDataStore generated from Ensembl data. - - Args: - genes: list of gene names - transcripts: list of transcript names - lengths: length of transcripts - canonical: defines whether transcript is canonical or not - symbols: list of gene symbols - """ - - def __init__( - self, - genes: Optional[np.array] = None, - transcripts: Optional[np.ndarray] = None, - lengths: Optional[np.ndarray] = None, - canonical: Optional[np.ndarray] = None, - symbols: Optional[np.ndarray] = None - ) -> None: - - if genes or transcripts or lengths or symbols is None: - raise Exception("All DataStore components must be provided at init") - else: - if genes.size != transcripts.size: - raise Exception("Gene array and transcript array are not the same size") - if genes.size != lengths.size: - raise Exception("Gene array and length array are not the same size") - if genes.size != symbols.size: - raise Exception("Gene array and symbol array are not the same size") - if canonical is None: - for i in genes: - canonical[i] = True - log.info(f"No canonical status input. All input lengths assumed to be canonical.") - - self.genes = genes - self.transcripts = transcripts - self.lengths = lengths - self.canonical = canonical - self.symbols = symbols - - @classmethod - def create_CustomDataStore( - cls, - gene_list: List[str], - transcript_list: List[str], - length_list: List[int], - is_canonical: List[bool], - symbols_list: List[str] - ) -> "CustomDataStore" : - """ - Creates a CustomDataStore for use with DGElist operations. Alternative for data lacking Ensembl files. - """ - - log.info(f"Creating CustomDataStore object using user-defined data") \ No newline at end of file diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index bf208bc..3bdaefd 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -213,7 +213,23 @@ def test_rpkm(): # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000) assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) +def test_tpm(): + dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) + icd = CanonicalDataStore( + get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) + ) + first_pos = dge_list.counts[0][0] + first_gene = dge_list.genes[0] + col_sum = np.sum(dge_list.counts, axis=0) + assert isinstance(first_pos, np.integer) + tpm_dge = dge_list.tpm(icd) + ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) + gene_len = icd.get_length_of_canonical_transcript(ensg_gene) + + # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6 + assert tpm_dge.counts[0][0] == (first_pos / col_sum[0]) * 1e6 +''' def test_tpm(): # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]]) @@ -242,7 +258,7 @@ def test_tpm(): # make sure that the sums of all genes across are the same the each sample (an important property of TPM) gene_sums = new_dge_list.counts.sum(axis=0) assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums)) - +''' # Unit tests for ``edgePy.data_import.Importer``.\ def test_init(): From b925fa775208a1230f96527fdf276cd22a75f7fe Mon Sep 17 00:00:00 2001 From: zbohannan Date: Wed, 12 Dec 2018 10:28:23 -0600 Subject: [PATCH 05/20] Address review issues --- edgePy/DGEList.py | 89 ++++++++++++------------------------------- tests/test_DGEList.py | 37 +++--------------- 2 files changed, 31 insertions(+), 95 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index a2f84ad..f979944 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -357,48 +357,6 @@ def rpkm( return self.copy(counts=counts, current_log=current_log, genes=genes) - def tpm( - self, - gene_data: CanonicalDataStore, - transform_to_log: bool = False, - prior_count: float = PRIOR_COUNT, - ) -> "DGEList": - """Return the DGEList normalized to transcripts per million reads. - TPM = countsPerBase * ( 1/sum[countsPerBase]) * 10^6 - From Harold Pimentel; "What is FPKM? A review of RNA-Seq expression units" - https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ - - Args: - gene_data: An object that works to import Ensembl based data, for use in calculations - transform_to_log: true, if you wish to convert to log after converting to RPKM - prior_count: a minimum value for genes, if you do log transforms. - """ - current_log = self.current_log_status - - #checks current log status and converts if currently in log format - if self.current_log_status: - self.counts = np.exp(self.counts) - current_log = False - - #returns canonical gene length and gene mask - gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) - - genes = self.genes[gene_mask].copy() - counts = self.counts[gene_mask].copy() - - #calculates counts per base (ie, the rate) - rate = (counts.T / gene_len_ordered).T - - #calculates tpm - counts = (rate / np.sum(rate)) * 1e6 - - if transform_to_log: - counts = self.log_transform(counts, prior_count) - current_log = True - - return self.copy(counts=counts, current_log=current_log, genes=genes) - - def get_gene_mask_and_lengths(self, gene_data): """ @@ -440,18 +398,18 @@ def get_gene_mask_and_lengths(self, gene_data): gene_mask.append(False) return gene_len_ordered, gene_mask - '''Commented out for now. May reuse later, depending on approach to user-defined DGELists def tpm( self, - gene_lengths: np.ndarray, + gene_data: CanonicalDataStore, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT, - mean_fragment_lengths: np.ndarray = None, ) -> "DGEList": """Normalize the DGEList to transcripts per million. - Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data: - RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3 + Adapted from Li et al. "RNA-Seq gene expression estimation with read mapping uncertainty." + Bioinformatics, Volume 26, Issue 4, 15 February 2010, Pages 493–500, + https://doi.org/10.1093/bioinformatics/btp692 + Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` ) are normalized as follows: @@ -463,32 +421,35 @@ def tpm( Args: gene_data: An object that works to import Ensembl based data, for use in calculations - transform_to_log: store log outputs - prior_count: a minimum value for genes, if you do log transforms - mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts` - (optional) + transform_to_log: true, if you wish to convert to log after converting to RPKM + prior_count: a minimum value for genes, if you do log transforms. """ + current_log = self.current_log_status - # compute effective length not allowing negative lengths - if mean_fragment_lengths: - effective_lengths = ( - gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] - ).clip(min=1) - else: - effective_lengths = gene_lengths[:, np.newaxis] + # checks current log status and converts if currently in log format + if self.current_log_status: + self.counts = np.exp(self.counts) + current_log = False - # how many counts per base - base_counts = self.counts / effective_lengths + # returns canonical gene length and gene mask + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + + genes = self.genes[gene_mask].copy() + counts = self.counts[gene_mask].copy() + + # calculates counts per base (ie, the rate) + rate = (counts.T / gene_len_ordered).T + + # calculates tpm + counts = (rate / np.sum(rate)) * 1e6 - counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :] - current_log = self.current_log_status if transform_to_log: counts = self.log_transform(counts, prior_count) current_log = True - return self.copy(counts=counts, current_log=current_log) - ''' + return self.copy(counts=counts, current_log=current_log, genes=genes) + def __repr__(self) -> str: """Give a pretty non-executeable representation of this object.""" num_samples = len(self._samples) if self._samples is not None else 0 diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index 3bdaefd..f30b61b 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -221,44 +221,19 @@ def test_tpm(): first_pos = dge_list.counts[0][0] first_gene = dge_list.genes[0] - col_sum = np.sum(dge_list.counts, axis=0) + gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd) + + rate = (dge_list.counts.T / gene_len_ordered).T + rate_sum = np.sum(rate) + assert isinstance(first_pos, np.integer) tpm_dge = dge_list.tpm(icd) ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) gene_len = icd.get_length_of_canonical_transcript(ensg_gene) # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6 - assert tpm_dge.counts[0][0] == (first_pos / col_sum[0]) * 1e6 -''' -def test_tpm(): - # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w - counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]]) - gene_lengths = np.array([2000, 4000, 1000, 10000]) - - expected = np.array( - [ - [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26], - [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26], - [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26], - [0.0, 0.0, 2217.294_900_22], - ] - ) - - dge_list = DGEList( - counts=counts, - samples=np.array(['a', 'b', 'c']), - genes=np.array(['a', 'b', 'c', 'd']), - groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']}, - ) - assert isinstance(dge_list.counts[0][0], np.integer) - new_dge_list = dge_list.tpm(gene_lengths) - - assert np.allclose(new_dge_list.counts, expected, atol=1e-1) + assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6 - # make sure that the sums of all genes across are the same the each sample (an important property of TPM) - gene_sums = new_dge_list.counts.sum(axis=0) - assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums)) -''' # Unit tests for ``edgePy.data_import.Importer``.\ def test_init(): From 7008d8b1bfe7dc02c6e8d3fc2783c07aea26d7e0 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Wed, 12 Dec 2018 11:14:30 -0600 Subject: [PATCH 06/20] Created new rate fn for TPM and test --- edgePy/DGEList.py | 33 +++++++++++++++++++++++---------- tests/test_DGEList.py | 5 +++-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index f979944..1888d88 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -421,7 +421,7 @@ def tpm( Args: gene_data: An object that works to import Ensembl based data, for use in calculations - transform_to_log: true, if you wish to convert to log after converting to RPKM + transform_to_log: true, if you wish to convert to log after converting to TPM prior_count: a minimum value for genes, if you do log transforms. """ @@ -432,17 +432,11 @@ def tpm( self.counts = np.exp(self.counts) current_log = False - # returns canonical gene length and gene mask - gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) - - genes = self.genes[gene_mask].copy() - counts = self.counts[gene_mask].copy() - - # calculates counts per base (ie, the rate) - rate = (counts.T / gene_len_ordered).T + rates = self.get_rates(gene_data) + rate_sum = np.sum(rates) # calculates tpm - counts = (rate / np.sum(rate)) * 1e6 + counts = (rates / rate_sum) * 1e6 if transform_to_log: counts = self.log_transform(counts, prior_count) @@ -450,6 +444,25 @@ def tpm( return self.copy(counts=counts, current_log=current_log, genes=genes) + def get_rates( + self, + gene_data: CanonicalDataStore + ) -> "DGEList": + """Gets the number of counts per base, otherwise known as the rate, for all genes. + Currently used for TPM calculations. + + Args: + gene_data: An object that works to import Ensembl based data, for use in calculations + """ + + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + counts = self.counts[gene_mask].copy() + + # calculates counts per base (ie, the rate) + rates = (counts.T / gene_len_ordered).T + + return rates + def __repr__(self) -> str: """Give a pretty non-executeable representation of this object.""" num_samples = len(self._samples) if self._samples is not None else 0 diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index f30b61b..4078d24 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -223,8 +223,9 @@ def test_tpm(): gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd) - rate = (dge_list.counts.T / gene_len_ordered).T - rate_sum = np.sum(rate) + rates = dge_list.get_rates(icd) + + rate_sum = np.sum(rates) assert isinstance(first_pos, np.integer) tpm_dge = dge_list.tpm(icd) From 55c7e2655dfae0d23ff02d6efac2a4a6c3c7032b Mon Sep 17 00:00:00 2001 From: zbohannan Date: Wed, 12 Dec 2018 11:24:49 -0600 Subject: [PATCH 07/20] Minor cleanup --- edgePy/DGEList.py | 1 + 1 file changed, 1 insertion(+) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 1888d88..7407376 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -432,6 +432,7 @@ def tpm( self.counts = np.exp(self.counts) current_log = False + genes = self.genes[gene_mask].copy() rates = self.get_rates(gene_data) rate_sum = np.sum(rates) From 91216648fa3c54b052a020d9752d8ea9cb69a58f Mon Sep 17 00:00:00 2001 From: zbohannan Date: Wed, 12 Dec 2018 11:29:17 -0600 Subject: [PATCH 08/20] Fix minor errors --- edgePy/DGEList.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 7407376..ef98607 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -432,7 +432,10 @@ def tpm( self.counts = np.exp(self.counts) current_log = False + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + genes = self.genes[gene_mask].copy() + rates = self.get_rates(gene_data) rate_sum = np.sum(rates) From eba0419df7e60620c8cc8a841550196bf68ea51b Mon Sep 17 00:00:00 2001 From: zbohannan Date: Wed, 12 Dec 2018 11:57:52 -0600 Subject: [PATCH 09/20] Fix PR issues, add fn to get counts per base --- edgePy/DGEList.py | 2 +- tests/test_DGEList.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index ef98607..bd2c072 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -435,7 +435,7 @@ def tpm( gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) genes = self.genes[gene_mask].copy() - + rates = self.get_rates(gene_data) rate_sum = np.sum(rates) diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index 4078d24..a581b11 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -221,8 +221,6 @@ def test_tpm(): first_pos = dge_list.counts[0][0] first_gene = dge_list.genes[0] - gene_len_ordered, gene_mask = dge_list.get_gene_mask_and_lengths(icd) - rates = dge_list.get_rates(icd) rate_sum = np.sum(rates) @@ -232,7 +230,7 @@ def test_tpm(): ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) gene_len = icd.get_length_of_canonical_transcript(ensg_gene) - # TPM=countsPerBase * (1/sum[countsPerBase]) * 10^6 + # TPM = countsPerBase / sum[countsPerBase]) * 10^6 assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6 From 4f88681c78e529aa4e8fd809e39efc9a23a91037 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 20:53:28 -0600 Subject: [PATCH 10/20] Add kb->b conversion --- edgePy/DGEList.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index bd2c072..1c6aacd 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -360,7 +360,7 @@ def rpkm( def get_gene_mask_and_lengths(self, gene_data): """ - use gene_data to get the gene lenths and a gene mask for the tranformation. + use gene_data to get the gene lengths and a gene mask for the tranformation. Args: gene_data: the object that holds gene data from ensembl @@ -462,6 +462,8 @@ def get_rates( gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) counts = self.counts[gene_mask].copy() + gene_len_ordered = gene_len_ordered * 1e3 + # calculates counts per base (ie, the rate) rates = (counts.T / gene_len_ordered).T From 066f915c0629bf3c1ee25d54510a11a7efe4ca19 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 20:59:53 -0600 Subject: [PATCH 11/20] Adjust TPM for length conversion --- edgePy/DGEList.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 1c6aacd..95619a3 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -439,8 +439,8 @@ def tpm( rates = self.get_rates(gene_data) rate_sum = np.sum(rates) - # calculates tpm - counts = (rates / rate_sum) * 1e6 + # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths + counts = (rates / rate_sum) * 1e9 if transform_to_log: counts = self.log_transform(counts, prior_count) @@ -462,7 +462,6 @@ def get_rates( gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) counts = self.counts[gene_mask].copy() - gene_len_ordered = gene_len_ordered * 1e3 # calculates counts per base (ie, the rate) rates = (counts.T / gene_len_ordered).T From e1acb5f1cba07ee5965852478dfcd95a36d286c6 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:04:50 -0600 Subject: [PATCH 12/20] Testing kb conversion --- edgePy/DGEList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 95619a3..148bf13 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -440,7 +440,7 @@ def tpm( rate_sum = np.sum(rates) # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths - counts = (rates / rate_sum) * 1e9 + counts = (rates / rate_sum) * 1e3 if transform_to_log: counts = self.log_transform(counts, prior_count) From 11653f95319e711a8abd8cef7cbaef381d67e2fa Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:20:34 -0600 Subject: [PATCH 13/20] Add get_rates test v1 --- tests/test_DGEList.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index a581b11..779ad53 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -213,6 +213,22 @@ def test_rpkm(): # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000) assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) +def test_get_rates(): + dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) + icd = CanonicalDataStore( + get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) + ) + first_pos = dge_list.counts[0][0] + first_gene = dge_list.genes[0] + + assert isinstance(first_pos, np.integer) + ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) + gene_len = icd.get_length_of_canonical_transcript(ensg_gene) + + rates = dge_list.get_rates(icd) + + assert rates[0] == first_pos / gene_len + def test_tpm(): dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) icd = CanonicalDataStore( From 166ab5c79e82069e32649e7d88e5fc142b9493d7 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:24:14 -0600 Subject: [PATCH 14/20] Test get_rates v2 --- tests/test_DGEList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index 779ad53..4ac3f67 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -227,7 +227,7 @@ def test_get_rates(): rates = dge_list.get_rates(icd) - assert rates[0] == first_pos / gene_len + assert rates[0][0] == first_pos / gene_len def test_tpm(): dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) From d94bdbcf20079251d5ea3a957e6064557dd21a28 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:32:46 -0600 Subject: [PATCH 15/20] Test get_rates v3 --- edgePy/DGEList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 148bf13..a7a9e8e 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -464,7 +464,7 @@ def get_rates( # calculates counts per base (ie, the rate) - rates = (counts.T / gene_len_ordered).T + rates = (counts.T / (gene_len_ordered * 1e3)).T return rates From 36af95eafd0b26f1db33c536b73d171ed81764f2 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:38:42 -0600 Subject: [PATCH 16/20] Add get rates test v4 --- edgePy/DGEList.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index a7a9e8e..0914d63 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -464,8 +464,10 @@ def get_rates( # calculates counts per base (ie, the rate) - rates = (counts.T / (gene_len_ordered * 1e3)).T + rates = (counts.T / gene_len_ordered).T + rates = rates * 1e3 + return rates def __repr__(self) -> str: From 77418a639aea047ba37431694ea94efda8a4d11f Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:45:08 -0600 Subject: [PATCH 17/20] Add get rates test v5 --- edgePy/DGEList.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 0914d63..fda5a38 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -466,8 +466,8 @@ def get_rates( # calculates counts per base (ie, the rate) rates = (counts.T / gene_len_ordered).T - rates = rates * 1e3 - + rates = rates / 1e3 + return rates def __repr__(self) -> str: From 7da1f8f37d40d7d5891c8528ab23e0ceecd74dd6 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Sat, 15 Dec 2018 21:52:32 -0600 Subject: [PATCH 18/20] Add get rate test v6 --- edgePy/DGEList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index fda5a38..4c3d6c4 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -440,7 +440,7 @@ def tpm( rate_sum = np.sum(rates) # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths - counts = (rates / rate_sum) * 1e3 + counts = (rates / rate_sum) * 1e6 if transform_to_log: counts = self.log_transform(counts, prior_count) From 8295ac97d0f2ed1f1d1d6a54bc65b848913cc975 Mon Sep 17 00:00:00 2001 From: zbohannan Date: Mon, 17 Dec 2018 10:44:18 -0600 Subject: [PATCH 19/20] Reformat --- edgePy/DGEList.py | 83 +++++++++++++++++++++---------------------- tests/test_DGEList.py | 7 ++-- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index 4c3d6c4..aba1a1e 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -52,17 +52,17 @@ class DGEList(object): ) def __init__( - self, - counts: Optional[np.ndarray] = None, - samples: Optional[np.array] = None, - genes: Optional[np.array] = None, - norm_factors: Optional[np.array] = None, - groups_in_list: Optional[np.array] = None, - groups_in_dict: Optional[Dict] = None, - to_remove_zeroes: Optional[bool] = False, - filename: Optional[str] = None, - current_transform_type: Optional[str] = None, - current_log_status: Optional[bool] = False, + self, + counts: Optional[np.ndarray] = None, + samples: Optional[np.array] = None, + genes: Optional[np.array] = None, + norm_factors: Optional[np.array] = None, + groups_in_list: Optional[np.array] = None, + groups_in_dict: Optional[Dict] = None, + to_remove_zeroes: Optional[bool] = False, + filename: Optional[str] = None, + current_transform_type: Optional[str] = None, + current_log_status: Optional[bool] = False, ) -> None: self.to_remove_zeroes = to_remove_zeroes @@ -108,16 +108,16 @@ def __init__( ) def copy( - self, - counts: Optional[np.ndarray] = None, - samples: Optional[np.array] = None, - genes: Optional[np.array] = None, - norm_factors: Optional[np.array] = None, - groups_in_list: Optional[np.array] = None, - groups_in_dict: Optional[Dict] = None, - to_remove_zeroes: Optional[bool] = False, - current_type: Optional[str] = None, - current_log: Optional[bool] = False, + self, + counts: Optional[np.ndarray] = None, + samples: Optional[np.array] = None, + genes: Optional[np.array] = None, + norm_factors: Optional[np.array] = None, + groups_in_list: Optional[np.array] = None, + groups_in_dict: Optional[Dict] = None, + to_remove_zeroes: Optional[bool] = False, + current_type: Optional[str] = None, + current_log: Optional[bool] = False, ) -> "DGEList": return DGEList( @@ -323,10 +323,10 @@ def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT) return self.copy(counts=counts, current_log=current_log) def rpkm( - self, - gene_data: CanonicalDataStore, - transform_to_log: bool = False, - prior_count: float = PRIOR_COUNT, + self, + gene_data: CanonicalDataStore, + transform_to_log: bool = False, + prior_count: float = PRIOR_COUNT, ) -> "DGEList": """Return the DGEList normalized to reads per kilobase of gene length per million reads. (RPKM = numReads / ( geneLength/1000 * totalNumReads/1,000,000 ) @@ -399,10 +399,10 @@ def get_gene_mask_and_lengths(self, gene_data): return gene_len_ordered, gene_mask def tpm( - self, - gene_data: CanonicalDataStore, - transform_to_log: bool = False, - prior_count: float = PRIOR_COUNT, + self, + gene_data: CanonicalDataStore, + transform_to_log: bool = False, + prior_count: float = PRIOR_COUNT, ) -> "DGEList": """Normalize the DGEList to transcripts per million. @@ -449,9 +449,9 @@ def tpm( return self.copy(counts=counts, current_log=current_log, genes=genes) def get_rates( - self, - gene_data: CanonicalDataStore - ) -> "DGEList": + self, + gene_data: CanonicalDataStore + ) -> "DGEList": """Gets the number of counts per base, otherwise known as the rate, for all genes. Currently used for TPM calculations. @@ -462,7 +462,6 @@ def get_rates( gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) counts = self.counts[gene_mask].copy() - # calculates counts per base (ie, the rate) rates = (counts.T / gene_len_ordered).T @@ -518,12 +517,12 @@ def read_npz_file(self, filename: str) -> None: @classmethod def create_DGEList( - cls, - sample_list: List[str], - data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, - gene_list: List[str], - sample_to_category: Optional[List[str]] = None, - category_to_samples: Optional[Dict[Hashable, List[str]]] = None, + cls, + sample_list: List[str], + data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, + gene_list: List[str], + sample_to_category: Optional[List[str]] = None, + category_to_samples: Optional[Dict[Hashable, List[str]]] = None, ) -> "DGEList": """ sample list and gene list must be pre-sorted Use this to create the DGE object for future work.""" @@ -548,7 +547,7 @@ def create_DGEList( @classmethod def create_DGEList_data_file( - cls, data_file: Path, group_file: Path, **kwargs: Mapping + cls, data_file: Path, group_file: Path, **kwargs: Mapping ) -> "DGEList": """Wrapper for creating DGEList objects from file locations. Performs open and passes the file handles to the method for creating a DGEList object. @@ -567,13 +566,13 @@ def create_DGEList_data_file( """ with smart_open(data_file, 'r') as data_handle, smart_open( - group_file, 'r' + group_file, 'r' ) as group_handle: return cls.create_DGEList_handle(data_handle, group_handle, **kwargs) @classmethod def create_DGEList_handle( - cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping + cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping ) -> "DGEList": """Read in a file-like object of delimited data for instantiation. diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index 4ac3f67..c54e53b 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -18,7 +18,7 @@ @pytest.fixture def dge_list(): with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open( - get_dataset_path(TEST_GROUPS), 'r' + get_dataset_path(TEST_GROUPS), 'r' ) as group_handle: return DGEList.create_DGEList_handle(data_handle, group_handle) @@ -40,7 +40,6 @@ def test_sample_group_list(): def test_minimal_init(): - dge_list = DGEList( to_remove_zeroes=False, counts=np.ones(shape=(5, 5)), @@ -92,7 +91,6 @@ def test_library_size(): def test_setting_DGElist_counts(): - dge_list = DGEList( counts=np.zeros(shape=(5, 10)), groups_in_list=['A', 'A', 'B', 'B', 'B'], @@ -116,7 +114,6 @@ def test_setting_DGElist_counts(): def test_cycle_dge_npz(): - import tempfile import os @@ -213,6 +210,7 @@ def test_rpkm(): # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000) assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) + def test_get_rates(): dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) icd = CanonicalDataStore( @@ -229,6 +227,7 @@ def test_get_rates(): assert rates[0][0] == first_pos / gene_len + def test_tpm(): dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) icd = CanonicalDataStore( From 33699300b6e0ebc73d6ef4790f87a7072f7c2b8e Mon Sep 17 00:00:00 2001 From: zbohannan Date: Mon, 17 Dec 2018 11:44:02 -0600 Subject: [PATCH 20/20] Reformat --- docs/source/conf.py | 1 - edgePy/DGEList.py | 81 ++++++++++++++-------------- tests/mongodb/test_gene_functions.py | 41 ++++++++------ tests/test_DGEList.py | 2 +- 4 files changed, 65 insertions(+), 60 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6c8991f..bc83afd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,3 @@ - # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index aba1a1e..614a3d8 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -52,17 +52,17 @@ class DGEList(object): ) def __init__( - self, - counts: Optional[np.ndarray] = None, - samples: Optional[np.array] = None, - genes: Optional[np.array] = None, - norm_factors: Optional[np.array] = None, - groups_in_list: Optional[np.array] = None, - groups_in_dict: Optional[Dict] = None, - to_remove_zeroes: Optional[bool] = False, - filename: Optional[str] = None, - current_transform_type: Optional[str] = None, - current_log_status: Optional[bool] = False, + self, + counts: Optional[np.ndarray] = None, + samples: Optional[np.array] = None, + genes: Optional[np.array] = None, + norm_factors: Optional[np.array] = None, + groups_in_list: Optional[np.array] = None, + groups_in_dict: Optional[Dict] = None, + to_remove_zeroes: Optional[bool] = False, + filename: Optional[str] = None, + current_transform_type: Optional[str] = None, + current_log_status: Optional[bool] = False, ) -> None: self.to_remove_zeroes = to_remove_zeroes @@ -108,16 +108,16 @@ def __init__( ) def copy( - self, - counts: Optional[np.ndarray] = None, - samples: Optional[np.array] = None, - genes: Optional[np.array] = None, - norm_factors: Optional[np.array] = None, - groups_in_list: Optional[np.array] = None, - groups_in_dict: Optional[Dict] = None, - to_remove_zeroes: Optional[bool] = False, - current_type: Optional[str] = None, - current_log: Optional[bool] = False, + self, + counts: Optional[np.ndarray] = None, + samples: Optional[np.array] = None, + genes: Optional[np.array] = None, + norm_factors: Optional[np.array] = None, + groups_in_list: Optional[np.array] = None, + groups_in_dict: Optional[Dict] = None, + to_remove_zeroes: Optional[bool] = False, + current_type: Optional[str] = None, + current_log: Optional[bool] = False, ) -> "DGEList": return DGEList( @@ -323,10 +323,10 @@ def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT) return self.copy(counts=counts, current_log=current_log) def rpkm( - self, - gene_data: CanonicalDataStore, - transform_to_log: bool = False, - prior_count: float = PRIOR_COUNT, + self, + gene_data: CanonicalDataStore, + transform_to_log: bool = False, + prior_count: float = PRIOR_COUNT, ) -> "DGEList": """Return the DGEList normalized to reads per kilobase of gene length per million reads. (RPKM = numReads / ( geneLength/1000 * totalNumReads/1,000,000 ) @@ -399,10 +399,10 @@ def get_gene_mask_and_lengths(self, gene_data): return gene_len_ordered, gene_mask def tpm( - self, - gene_data: CanonicalDataStore, - transform_to_log: bool = False, - prior_count: float = PRIOR_COUNT, + self, + gene_data: CanonicalDataStore, + transform_to_log: bool = False, + prior_count: float = PRIOR_COUNT, ) -> "DGEList": """Normalize the DGEList to transcripts per million. @@ -448,10 +448,7 @@ def tpm( return self.copy(counts=counts, current_log=current_log, genes=genes) - def get_rates( - self, - gene_data: CanonicalDataStore - ) -> "DGEList": + def get_rates(self, gene_data: CanonicalDataStore) -> "DGEList": """Gets the number of counts per base, otherwise known as the rate, for all genes. Currently used for TPM calculations. @@ -517,12 +514,12 @@ def read_npz_file(self, filename: str) -> None: @classmethod def create_DGEList( - cls, - sample_list: List[str], - data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, - gene_list: List[str], - sample_to_category: Optional[List[str]] = None, - category_to_samples: Optional[Dict[Hashable, List[str]]] = None, + cls, + sample_list: List[str], + data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, + gene_list: List[str], + sample_to_category: Optional[List[str]] = None, + category_to_samples: Optional[Dict[Hashable, List[str]]] = None, ) -> "DGEList": """ sample list and gene list must be pre-sorted Use this to create the DGE object for future work.""" @@ -547,7 +544,7 @@ def create_DGEList( @classmethod def create_DGEList_data_file( - cls, data_file: Path, group_file: Path, **kwargs: Mapping + cls, data_file: Path, group_file: Path, **kwargs: Mapping ) -> "DGEList": """Wrapper for creating DGEList objects from file locations. Performs open and passes the file handles to the method for creating a DGEList object. @@ -566,13 +563,13 @@ def create_DGEList_data_file( """ with smart_open(data_file, 'r') as data_handle, smart_open( - group_file, 'r' + group_file, 'r' ) as group_handle: return cls.create_DGEList_handle(data_handle, group_handle, **kwargs) @classmethod def create_DGEList_handle( - cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping + cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping ) -> "DGEList": """Read in a file-like object of delimited data for instantiation. diff --git a/tests/mongodb/test_gene_functions.py b/tests/mongodb/test_gene_functions.py index 9a41aa2..a667a22 100644 --- a/tests/mongodb/test_gene_functions.py +++ b/tests/mongodb/test_gene_functions.py @@ -16,28 +16,34 @@ "size": 720, "canonical": "0", "exons": { - "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, + "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, }, - "rpkm": 0.39507510837872767, + "rpkm": 0.395_075_108_378_727_67, }, "ENST00000576696": { "size": 1306, "canonical": "0", "exons": { - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, - "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, + "ENSE00002672617": { + "raw": 5.564_356_435_643_564, + "rpkm": 0.160_022_655_238_508_75, + }, }, - "rpkm": 0.195204453741728, + "rpkm": 0.195_204_453_741_728, }, "ENST00000443778": { "size": 2084, "canonical": "1", "exons": { - "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511}, - "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 0.22669418591124607}, + "ENSE00001729822": {"raw": 2, "rpkm": 0.099_786_557_974_451_1}, + "ENSE00001608298": { + "raw": 1.177_717_771_777_177_6, + "rpkm": 0.226_694_185_911_246_07, + }, }, - "rpkm": 0.05165702955135873, + "rpkm": 0.051_657_029_551_358_73, }, }, "star_rpkm": None, @@ -53,19 +59,22 @@ "size": 720, "canonical": "0", "exons": { - "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, + "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, }, - "rpkm": 0.39507510837872767, + "rpkm": 0.395_075_108_378_727_67, }, "ENST00000576696": { "size": 1306, "canonical": "0", "exons": { - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, - "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, + "ENSE00002672617": { + "raw": 5.564_356_435_643_564, + "rpkm": 0.160_022_655_238_508_75, + }, }, - "rpkm": 0.195204453741728, + "rpkm": 0.195_204_453_741_728, }, }, "star_rpkm": None, @@ -137,7 +146,7 @@ def test_get_sample_details(mongodb): def test_get_canonical_rpkm(): rpkm = get_canonical_rpkm(RNASeq_RECORD) - assert rpkm == 0.05165702955135873 + assert rpkm == 0.051_657_029_551_358_73 def test_get_canonical_rpkm_no_canonical(): diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index c54e53b..bb3e00a 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -18,7 +18,7 @@ @pytest.fixture def dge_list(): with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open( - get_dataset_path(TEST_GROUPS), 'r' + get_dataset_path(TEST_GROUPS), 'r' ) as group_handle: return DGEList.create_DGEList_handle(data_handle, group_handle)