diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..0f0d0eb --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,7 @@ +^\.github$ +^data-raw$ +^environment\.yml$ +^\.git$ +^.*\.Rproj$ +^\.Rproj\.user$ +^LICENSE\.md$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..00d906f --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,49 @@ +on: + push: + branches: [main, master, restructure] + pull_request: + branches: [main, master, restructure] + +name: R-CMD-check + +permissions: read-all + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (R ${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel'} + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + use-public-rspm: true + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual", "--as-cran")' + error-on: '"warning"' + upload-snapshots: true diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..1b5f116 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,63 @@ +on: + push: + tags: + - 'v*' + +name: Release + +permissions: + contents: write + +jobs: + release: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: 'release' + use-public-rspm: true + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - name: Check package + shell: Rscript {0} + run: | + rcmdcheck::rcmdcheck( + args = c("--no-manual", "--as-cran"), + error_on = "warning" + ) + + - name: Build source tarball + run: R CMD build . + + - name: Get package version and tarball name + id: pkg + run: | + VERSION=$(grep "^Version:" DESCRIPTION | sed 's/Version:[[:space:]]*//') + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tarball=DeOPUS_${VERSION}.tar.gz" >> "$GITHUB_OUTPUT" + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + name: "DeOPUS ${{ steps.pkg.outputs.version }}" + body: | + ## DeOPUS ${{ steps.pkg.outputs.version }} + + See [NEWS.md](https://github.com/tinnlab/DeOPUS/blob/main/NEWS.md) for a full list of changes. + + ### Installation + + ```r + devtools::install_github("tinnlab/DeOPUS@${{ github.ref_name }}") + ``` + files: ${{ steps.pkg.outputs.tarball }} + fail_on_unmatched_files: true diff --git a/.gitignore b/.gitignore index e43b0f9..3094af6 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,8 @@ .DS_Store +.Rproj.user +.Rhistory +.RData +.Ruserdata +*.Rproj +/doc/ +/Meta/ diff --git a/DeOPUS/DESCRIPTION b/DESCRIPTION similarity index 81% rename from DeOPUS/DESCRIPTION rename to DESCRIPTION index 84ddc1f..3534a23 100644 --- a/DeOPUS/DESCRIPTION +++ b/DESCRIPTION @@ -5,27 +5,28 @@ Version: 1.0.0 Authors@R: c( person("Ha", "Nguyen", email = "hvn0006@wayne.edu", role = c("aut", "cre")) ) -Description: A reference-based cellular deconvolution method that employs - hierarchical shrinkage transformation to robustly estimate cell-type - proportions from bulk RNA sequencing data. DeOPUS applies a multi-level - adaptive transformation combining local and global shrinkage priors, - variance-stabilizing power transformation, and quantile normalization +Description: A reference-based cellular deconvolution method that employs + hierarchical shrinkage transformation to robustly estimate cell-type + proportions from bulk RNA sequencing data. DeOPUS applies a multi-level + adaptive transformation combining local and global shrinkage priors, + variance-stabilizing power transformation, and quantile normalization to minimize the influence of outliers and high-variance genes. License: MIT + file LICENSE Encoding: UTF-8 +Depends: R (>= 3.5.0) LazyData: true -RoxygenNote: 7.2.3 +LazyDataCompression: xz +RoxygenNote: 7.3.3 Imports: Matrix, - dplyr, parallel, stats Suggests: testthat (>= 3.0.0), knitr, rmarkdown, - ggplot2, - tidyverse + ggplot2 VignetteBuilder: knitr -URL: https://github.com/tinnlab/DeOPUS/DeOPUS +Config/testthat/edition: 3 +URL: https://github.com/tinnlab/DeOPUS BugReports: https://github.com/tinnlab/DeOPUS/issues diff --git a/DeOPUS/.gitignore b/DeOPUS/.gitignore deleted file mode 100644 index ae972be..0000000 --- a/DeOPUS/.gitignore +++ /dev/null @@ -1,52 +0,0 @@ -# R specific -.Rproj.user -.Rhistory -.RData -.Ruserdata -*.Rproj - -# Package build files -*.tar.gz -*.tgz -DeOPUS.Rcheck/ -DeOPUS_*.tar.gz - -# Generated documentation -man/*.Rd -NAMESPACE - -# OS specific -.DS_Store -Thumbs.db - -# IDE specific -.idea/ -.vscode/ -*.swp -*.swo -*~ - -# Data files (large) -data/real_datasets/*.rds -data/simulated_datasets/*.rds -results/ - -# Temporary files -*.tmp -*.bak -*.log - -# Output files -*.pdf -*.png -*.jpg -!vignettes/*.png - -# Compiled code -*.o -*.so -*.dll - -# Sensitive information -*.lic -license.lic diff --git a/DeOPUS/R/deconvolve.R b/DeOPUS/R/deconvolve.R deleted file mode 100644 index a2eb369..0000000 --- a/DeOPUS/R/deconvolve.R +++ /dev/null @@ -1,220 +0,0 @@ -#' Deconvolve bulk RNA-seq data using DeOPUS -#' -#' @description -#' Estimates cell-type proportions from bulk RNA-seq data using a reference-based -#' approach with hierarchical shrinkage transformation. -#' -#' @param bulk A numeric matrix of bulk expression data (genes x samples). -#' @param reference A numeric matrix of reference expression profiles (genes x cell types). -#' @param alpha Numeric. Regularization parameter for transformation. Default is 0.01. -#' @param power Numeric. Power for loss function (1 = MAE, 2 = MSE). Default is 2. -#' @param n_cores Integer. Number of cores for parallel processing. Default is 1. -#' @param maxit Integer. Maximum number of optimization iterations. Default is 100. -#' @param verbose Logical. Whether to print progress messages. Default is FALSE. -#' -#' @return A list containing: -#' \itemize{ -#' \item \code{proportions}: Matrix of estimated cell-type proportions (samples x cell types) -#' \item \code{convergence}: List of convergence information for each sample -#' } -#' -#' @details -#' DeOPUS applies a multi-level adaptive transformation combining: -#' \itemize{ -#' \item Local shrinkage: Attenuates influence of high-variance genes -#' \item Global shrinkage: Provides overall regularization -#' \item Power transformation: Stabilizes variance -#' \item Quantile normalization: Ensures robust profile comparison -#' } -#' -#' @examples -#' \dontrun{ -#' # Load example data -#' data(example_bulk) -#' data(example_reference) -#' -#' # Run deconvolution -#' results <- deconvolve( -#' bulk = example_bulk, -#' reference = example_reference, -#' n_cores = 4 -#' ) -#' -#' # View proportions -#' head(results$proportions) -#' } -#' -#' @export -deconvolve <- function(bulk, - reference, - alpha = 0.01, - power = 2, - n_cores = 1, - maxit = 100, - verbose = FALSE) { - - - # Input validation - if (!is.matrix(bulk)) { - bulk <- as.matrix(bulk) - } - if (!is.matrix(reference)) { - reference <- as.matrix(reference) - } - - # Check for common genes - -common_genes <- intersect(rownames(bulk), rownames(reference)) - if (length(common_genes) == 0) { - stop("No common genes found between bulk and reference matrices.") - } - - if (verbose) { - message(sprintf("Using %d common genes for deconvolution", length(common_genes))) - } - - # Subset to common genes - bulk <- bulk[common_genes, , drop = FALSE] - reference <- reference[common_genes, , drop = FALSE] - - # Get dimensions - n_samples <- ncol(bulk) - n_cell_types <- ncol(reference) - cell_type_names <- colnames(reference) - sample_names <- colnames(bulk) - - # Define transformation function - transform_func <- function(q, alpha = 0.01) { - q[q < 0] <- 0 - - # Normalize by total counts - scale_factor <- mean(q) * 100 - if (scale_factor == 0) scale_factor <- 1 - - # Shrinkage parameters - tau <- 0.05 # Local shrinkage parameter - phi <- 0.8 # Global shrinkage intensity - omega <- 1.5 # Shape modulation factor - delta <- 0.3 # Adaptive regularization strength - - # Dispersion and shape parameters - kappa <- 0.1 - mu <- 2.0 - - # Hierarchical shrinkage adjustment - local_shrink <- tau / (tau + q / scale_factor) - global_shrink <- (1 - phi) * exp(-delta * q / scale_factor) + phi - - # Modified power transformation with adaptive hierarchical shrinkage - z <- ((q / scale_factor + kappa)^(mu * omega * global_shrink) - kappa^mu) / (mu * global_shrink) - - # Apply spline-based normalization - ranks <- rank(z) / length(z) - quantile_norm <- qnorm(pmax(0.001, pmin(0.999, ranks))) - - # Enhanced blending transformation with multi-level shrinkage - signal_weight <- pmin(1, q / (10 + alpha * q)) * (1 - local_shrink) - - # Incorporate prior information shrinkage - transformed <- signal_weight * z + (1 - signal_weight) * - global_shrink * - quantile_norm - - # Apply edge-preserving smoothing via shrinkage - transformed <- transformed / (1 + delta * abs(transformed)) - - # Scale to approximate log2 scale with adaptive shrinkage - transformed <- transformed / max(abs(transformed), na.rm = TRUE) * log2(max(q) + 1) - - return(transformed) - } - - # Define loss function - loss_function <- function(p, b, reference, alpha, power, transform_func) { - # Apply transformation to bulk - b_transformed <- transform_func(b, alpha) - - # Compute fitted values - fitted <- as.vector(reference %*% p) - fitted_transformed <- transform_func(fitted, alpha) - - # Calculate loss - return(sum(abs(b_transformed - fitted_transformed)^power)) - } - - # Run deconvolution for each sample - if (verbose) { - message(sprintf("Deconvolving %d samples using %d cores...", n_samples, n_cores)) - } - - results <- parallel::mclapply(seq_len(n_samples), mc.cores = n_cores, mc.preschedule = FALSE, function(i) { - result <- tryCatch({ - optim_result <- optim( - par = rep(1 / n_cell_types, n_cell_types), - fn = loss_function, - b = bulk[, i], - reference = reference, - alpha = alpha, - power = power, - transform_func = transform_func, - method = "L-BFGS-B", - lower = 0, - upper = 100, - control = list(maxit = maxit) - ) - - list( - proportions = optim_result$par, - convergence = optim_result$convergence, - value = optim_result$value - ) - }, error = function(e) { - list( - proportions = rep(0, n_cell_types), - convergence = -1, - value = NA, - error = conditionMessage(e) - ) - }) - - return(result) - }) - - # Extract proportions matrix - proportions <- do.call(rbind, lapply(results, function(x) x$proportions)) - - # Normalize to sum to 1 - row_sums <- rowSums(proportions) - row_sums[row_sums == 0] <- 1 # Avoid division by zero - proportions <- proportions / row_sums - - # Handle NAs - proportions[is.na(proportions)] <- 0 - - # Set names - colnames(proportions) <- cell_type_names - rownames(proportions) <- sample_names - - # Extract convergence info - convergence <- lapply(results, function(x) { - list( - convergence = x$convergence, - value = x$value, - error = x$error %||% NULL - ) - }) - names(convergence) <- sample_names - - if (verbose) { - n_converged <- sum(sapply(results, function(x) x$convergence == 0)) - message(sprintf("Deconvolution complete. %d/%d samples converged.", n_converged, n_samples)) - } - - return(list( - proportions = proportions, - convergence = convergence - )) -} - -#' @keywords internal -`%||%` <- function(a, b) if (is.null(a)) b else a diff --git a/DeOPUS/README.md b/DeOPUS/README.md deleted file mode 100644 index f3b4709..0000000 --- a/DeOPUS/README.md +++ /dev/null @@ -1,168 +0,0 @@ -# DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage - -[![R](https://img.shields.io/badge/R-%3E%3D4.0-blue.svg)](https://www.r-project.org/) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) - -DeOPUS is a reference-based cellular deconvolution method that employs hierarchical shrinkage transformation to robustly estimate cell-type proportions from bulk RNA sequencing data. - -## Overview - -Single-cell RNA sequencing enables comprehensive transcriptomic profiling at single-cell resolution, but high costs limit its widespread application. DeOPUS offers a cost-effective alternative by computationally estimating cell-type proportions from bulk RNA-seq data. - -**Key features:** -- Hierarchical shrinkage transformation with local and global priors -- Variance-stabilizing power transformation -- Quantile normalization to minimize outlier influence -- Robust performance across diverse tissues and cell-type complexities - -## Installation - -### From GitHub - -```r -# Install devtools if not already installed -install.packages("devtools") - -# Install DeOPUS -devtools::install_github("tinnlab/DeOPUS") -``` - -### Dependencies - -```r -install.packages(c("Matrix", "dplyr", "parallel")) -``` - -## Quick Start - -```r -library(DeOPUS) - -# Load your data -# cellTypeExpr: reference expression matrix (genes x cell types) -# bulk: bulk expression matrix (genes x samples) - -# Run deconvolution -results <- deconvolve( - bulk = bulk_matrix, - reference = reference_matrix, - alpha = 0.01, - n_cores = 4 -) - -# View estimated proportions -head(results$proportions) -``` - -## Input Data Format - -DeOPUS requires two main inputs: - -1. **Reference expression matrix** (`reference`): A genes × cell types matrix containing average expression profiles for each cell type, typically derived from scRNA-seq data. - -2. **Bulk expression matrix** (`bulk`): A genes × samples matrix containing bulk RNA-seq expression data to be deconvolved. - -Both matrices should: -- Have matching gene identifiers (rownames) -- Be in linear scale (not log-transformed) -- Contain non-negative values - -## Method Details - -DeOPUS applies a multi-level adaptive transformation: - -1. **Local shrinkage**: Attenuates the influence of individual high-variance genes -2. **Global shrinkage**: Provides overall regularization across the expression profile -3. **Power transformation**: Stabilizes variance across the dynamic range -4. **Quantile normalization**: Ensures robust comparison between predicted and observed profiles - -The optimization minimizes the weighted loss between transformed bulk and reconstructed expression profiles using L-BFGS-B with box constraints. - -## Benchmarking - -We benchmarked DeOPUS against six state-of-the-art methods: -- MuSiC -- AutoGeneS -- CIBERSORT -- FARDEEP -- scaden -- AdRoit - -### Running Benchmarks - -```r -# Run benchmark on simulated data -source("scripts/benchmark/run_benchmark_simulated.R") - -# Run benchmark on real data -source("scripts/benchmark/run_benchmark_real.R") - -# Generate visualization dashboard -source("scripts/analysis/visualize_results.R") -results <- create_summary_barplot_dashboard(benchmark_data) -``` - -## Reproducing Paper Results - -To reproduce the results from our paper: - -```bash -# Clone the repository -git clone https://github.com/tinnlab/DeOPUS.git -cd DeOPUS - -# Run the complete benchmark pipeline -Rscript scripts/benchmark/run_all_benchmarks.R - -# Generate figures -Rscript scripts/analysis/generate_figures.R -``` - -### Data Availability - -Benchmark datasets are available at https://doi.org/10.5281/zenodo.19050845 or can be generated using: - -```r -source("scripts/data/prepare_benchmark_data.R") -``` - -## Output - -DeOPUS returns a list containing: - -- `proportions`: Matrix of estimated cell-type proportions (samples × cell types) -- `convergence`: Optimization convergence information for each sample - -## Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `alpha` | 0.01 | Regularization parameter for transformation | -| `n_cores` | 1 | Number of parallel cores | -| `maxit` | 100 | Maximum optimization iterations | -| `power` | 2 | Loss function power (1=MAE, 2=MSE) | - -## Citation - -If you use DeOPUS in your research, please cite: - -```bibtex -@article{DeOPUS2025, - title={DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage}, - author={}, - journal={}, - year={2025}, - doi={} -} -``` - -[//]: # (## License) - -[//]: # () -[//]: # (This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.) - -[//]: # () -[//]: # (## Contact) - -[//]: # () -[//]: # (For questions or issues, please open an issue on GitHub or contact [email].) diff --git a/DeOPUS/tests/testthat/test_deconvolve.R b/DeOPUS/tests/testthat/test_deconvolve.R deleted file mode 100644 index 4a09b03..0000000 --- a/DeOPUS/tests/testthat/test_deconvolve.R +++ /dev/null @@ -1,98 +0,0 @@ -library(testthat) -library(DeOPUS) - -test_that("deconvolve works with simple input", { - - # Create simple test data - set.seed(42) - n_genes <- 100 - n_cell_types <- 3 - n_samples <- 5 - - # Reference matrix - reference <- matrix( - rpois(n_genes * n_cell_types, lambda = 100), - nrow = n_genes, - ncol = n_cell_types - ) - rownames(reference) <- paste0("Gene", 1:n_genes) - colnames(reference) <- paste0("CellType", 1:n_cell_types) - - # True proportions - true_props <- matrix( - c(0.5, 0.3, 0.2, - 0.2, 0.5, 0.3, - 0.3, 0.3, 0.4, - 0.6, 0.2, 0.2, - 0.1, 0.6, 0.3), - nrow = n_samples, - ncol = n_cell_types, - byrow = TRUE - ) - - # Generate bulk - bulk <- reference %*% t(true_props) + matrix(rnorm(n_genes * n_samples, sd = 5), nrow = n_genes) - bulk[bulk < 0] <- 0 - colnames(bulk) <- paste0("Sample", 1:n_samples) - - # Run deconvolution - results <- deconvolve( - bulk = bulk, - reference = reference, - n_cores = 1, - maxit = 50 - ) - - # Check output structure - expect_true(is.list(results)) - expect_true("proportions" %in% names(results)) - expect_true("convergence" %in% names(results)) - - # Check dimensions - expect_equal(nrow(results$proportions), n_samples) - expect_equal(ncol(results$proportions), n_cell_types) - - # Check proportions sum to 1 - row_sums <- rowSums(results$proportions) - expect_true(all(abs(row_sums - 1) < 1e-6)) - - # Check proportions are non-negative - expect_true(all(results$proportions >= 0)) -}) - -test_that("deconvolve handles mismatched genes", { - - set.seed(42) - - # Reference with genes 1-100 - reference <- matrix(rpois(100 * 3, lambda = 100), nrow = 100, ncol = 3) - rownames(reference) <- paste0("Gene", 1:100) - colnames(reference) <- paste0("CellType", 1:3) - - # Bulk with genes 50-150 (only 50 overlap) - bulk <- matrix(rpois(100 * 2, lambda = 100), nrow = 100, ncol = 2) - rownames(bulk) <- paste0("Gene", 50:149) - colnames(bulk) <- paste0("Sample", 1:2) - - # Should work with overlapping genes - results <- deconvolve(bulk = bulk, reference = reference, n_cores = 1) - - expect_equal(nrow(results$proportions), 2) - expect_equal(ncol(results$proportions), 3) -}) - -test_that("deconvolve errors with no common genes", { - - reference <- matrix(1:9, nrow = 3, ncol = 3) - rownames(reference) <- c("GeneA", "GeneB", "GeneC") - colnames(reference) <- paste0("CellType", 1:3) - - bulk <- matrix(1:6, nrow = 3, ncol = 2) - rownames(bulk) <- c("GeneX", "GeneY", "GeneZ") - colnames(bulk) <- paste0("Sample", 1:2) - - expect_error( - deconvolve(bulk = bulk, reference = reference), - "No common genes" - ) -}) diff --git a/DeOPUS/vignettes/getting_started.Rmd b/DeOPUS/vignettes/getting_started.Rmd deleted file mode 100644 index efb3c60..0000000 --- a/DeOPUS/vignettes/getting_started.Rmd +++ /dev/null @@ -1,171 +0,0 @@ ---- -title: "Getting Started with DeOPUS" -author: "Author Name" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Getting Started with DeOPUS} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.width = 7, - fig.height = 5 -) -``` - -## Introduction -DeOPUS (Deconvolution via Optimized Power-transformed Unmixing with Shrinkage) is a reference-based cellular deconvolution method that estimates cell-type proportions from bulk RNA sequencing data. - -## Installation - -Install DeOPUS from GitHub: - -```{r install, eval=FALSE} -devtools::install_github("yourusername/DeOPUS") -``` - -Load the package: - -```{r load} -library(DeOPUS) -``` - -## Quick Example - -### Prepare Input Data - -DeOPUS requires two inputs: - -1. **Reference matrix**: Average expression profiles for each cell type (genes × cell types) -2. **Bulk matrix**: Bulk RNA-seq expression data (genes × samples) - -```{r data, eval=FALSE} -# Example: Load your data -# reference <- readRDS("path/to/reference.rds") -# bulk <- readRDS("path/to/bulk.rds") - -# Create example data -set.seed(42) -n_genes <- 1000 -n_cell_types <- 5 -n_samples <- 10 - -# Simulated reference (genes x cell types) -reference <- matrix( - rpois(n_genes * n_cell_types, lambda = 100), - nrow = n_genes, - ncol = n_cell_types -) -rownames(reference) <- paste0("Gene", 1:n_genes) -colnames(reference) <- paste0("CellType", 1:n_cell_types) - -# True proportions -true_props <- matrix( - runif(n_samples * n_cell_types), - nrow = n_samples, - ncol = n_cell_types -) -true_props <- true_props / rowSums(true_props) - -# Simulated bulk (genes x samples) = reference %*% t(proportions) + noise -bulk <- reference %*% t(true_props) + matrix(rnorm(n_genes * n_samples, sd = 10), nrow = n_genes) -bulk[bulk < 0] <- 0 -colnames(bulk) <- paste0("Sample", 1:n_samples) -``` - -### Run Deconvolution - -```{r deconvolve, eval=FALSE} -# Run DeOPUS -results <- deconvolve( - bulk = bulk, - reference = reference, - alpha = 0.01, - n_cores = 4, - verbose = TRUE -) - -# View estimated proportions -print(results$proportions) -``` - -### Evaluate Results - -```{r evaluate, eval=FALSE} -# Compare with true proportions -estimated <- results$proportions - -# Calculate correlation -cor_values <- sapply(1:n_samples, function(i) { - cor(estimated[i, ], true_props[i, ], method = "pearson") -}) - -cat("Mean Pearson correlation:", mean(cor_values), "\n") -``` - -## Method Details - -DeOPUS applies a multi-level adaptive transformation: - -### 1. Hierarchical Shrinkage - -The transformation incorporates both local and global shrinkage: - -- **Local shrinkage** ($\tau$): Attenuates the influence of individual high-variance genes -- **Global shrinkage** ($\phi$): Provides overall regularization across the expression profile - -### 2. Power Transformation - -A variance-stabilizing power transformation handles the high dynamic range: - -$$z = \frac{(q/s + \kappa)^{\mu \omega g} - \kappa^\mu}{\mu g}$$ - -where $q$ is the expression value, $s$ is a scale factor, and $g$ is the global shrinkage weight. - -### 3. Quantile Normalization - -Spline-based quantile normalization ensures robust comparison between profiles: - -```{r qnorm, eval=FALSE} -ranks <- rank(z) / length(z) -quantile_norm <- qnorm(pmax(0.001, pmin(0.999, ranks))) -``` - -### 4. Optimization - -DeOPUS uses L-BFGS-B optimization with box constraints (proportions ≥ 0) to minimize the loss between transformed bulk and reconstructed expression. - -## Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `alpha` | 0.01 | Regularization parameter | -| `power` | 2 | Loss function power (1=MAE, 2=MSE) | -| `n_cores` | 1 | Parallel processing cores | -| `maxit` | 100 | Maximum iterations | - -## Benchmarking - -To reproduce the benchmark results from our paper: - -```{r benchmark, eval=FALSE} -# Run benchmark on real datasets -source("scripts/benchmark/run_benchmark_real.R") -run_benchmark() - -# Generate visualization -source("scripts/analysis/visualize_results.R") -results <- aggregate_results() -dashboard <- create_summary_barplot_dashboard(results) -``` - -## Session Info - -```{r sessioninfo} -sessionInfo() -``` diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4745632 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2025 +COPYRIGHT HOLDER: Ha Nguyen diff --git a/DeOPUS/LICENSE b/LICENSE.md similarity index 95% rename from DeOPUS/LICENSE rename to LICENSE.md index bb2eac1..288c3bb 100644 --- a/DeOPUS/LICENSE +++ b/LICENSE.md @@ -1,6 +1,6 @@ -MIT License +# MIT License -Copyright (c) 2025 [Author Name] +Copyright (c) 2025 Ha Nguyen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..563b10b --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,7 @@ +# Generated by roxygen2: do not edit by hand + +export(deconvolve) +importFrom(Matrix,sparseMatrix) +importFrom(parallel,mclapply) +importFrom(stats,optim) +importFrom(stats,qnorm) diff --git a/DeOPUS/NEWS.md b/NEWS.md similarity index 81% rename from DeOPUS/NEWS.md rename to NEWS.md index 0a6030e..4932a7c 100644 --- a/DeOPUS/NEWS.md +++ b/NEWS.md @@ -5,9 +5,11 @@ * First public release of DeOPUS (Deconvolution via Optimized Power-transformed Unmixing with Shrinkage) * Core deconvolution algorithm with hierarchical shrinkage transformation * Parallel processing support via `parallel::mclapply` -* Comprehensive benchmarking scripts for real and simulated datasets +* Built-in sample benchmark dataset (`sampleData`) for demonstration and testing +* Comprehensive benchmarking scripts in `inst/scripts/` * Visualization dashboard for benchmark results * Full documentation and vignettes +* GitHub Actions for CRAN-style checks on Linux/macOS/Windows and automated releases on version tags ## Key Features diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..daa1011 --- /dev/null +++ b/R/data.R @@ -0,0 +1,66 @@ +#' Sample bulk RNA-seq deconvolution benchmark dataset +#' +#' A simulated benchmark dataset with known ground-truth cell-type proportions +#' for evaluating and demonstrating the DeOPUS deconvolution method. +#' +#' @format A list with the following components: +#' \describe{ +#' \item{bulk}{Numeric matrix (11852 genes x 512 samples) of simulated bulk +#' RNA-seq expression values in linear scale. Row names are Ensembl gene IDs +#' (e.g. ENSG00000000419); column names are Mixture1 through Mixture512.} +#' \item{bulkRatio}{Numeric matrix (2 cell types x 512 samples) containing +#' the ground-truth cell-type proportions used to generate \code{bulk}. +#' Row names are \code{"mature NK T cell"} and \code{"mesenchymal stem cell"}. +#' Note: row order may differ from the column order of \code{cellTypeExpr}, +#' so align by name before comparing predictions to ground truth.} +#' \item{cellTypeExpr}{Numeric matrix (11852 genes x 2 cell types) of reference +#' expression profiles derived from single-cell RNA-seq data. +#' Column names match (but may be in a different order from) the row names of +#' \code{bulkRatio}.} +#' \item{signature}{Numeric matrix (974 signature genes x 2 cell types) of +#' the most discriminative marker genes for each cell type.} +#' \item{nCellTypes}{Integer. Number of cell types (2).} +#' \item{markers}{Named list of marker gene vectors, one entry per cell type.} +#' \item{sigGenes}{Character vector of signature gene names.} +#' \item{singleCellExpr}{Sparse matrix (dgCMatrix) of the single-cell RNA-seq +#' expression data used to derive the reference profiles.} +#' \item{singleCellLabels}{Character vector of cell-type labels for each cell +#' in \code{singleCellExpr}.} +#' \item{singleCellSubjects}{Character vector of donor/subject IDs for each +#' cell in \code{singleCellExpr}.} +#' } +#' +#' @usage data(sampleData) +#' +#' @examples +#' data(sampleData) +#' +#' # Inspect the dataset +#' dim(sampleData$bulk) # 11852 x 512 +#' dim(sampleData$cellTypeExpr) # 11852 x 2 +#' +#' # Run deconvolution on a small subset for speed +#' set.seed(42) +#' idx <- sample(ncol(sampleData$bulk), 10) +#' results <- deconvolve( +#' bulk = sampleData$bulk[, idx], +#' reference = sampleData$cellTypeExpr, +#' n_cores = 1, +#' maxit = 50 +#' ) +#' +#' # Align bulkRatio cell-type ORDER to match results$proportions columns +#' # (bulkRatio rows and cellTypeExpr columns may be in a different order). +#' ct <- colnames(results$proportions) +#' true_props <- t(sampleData$bulkRatio[ct, idx, drop = FALSE]) +#' +#' # Per-sample Pearson correlation with ground truth +#' cor_values <- sapply(seq_len(nrow(results$proportions)), function(i) { +#' cor(results$proportions[i, ], true_props[i, ], method = "pearson") +#' }) +#' mean(cor_values, na.rm = TRUE) +#' +#' @source +#' Simulated using negative-binomial expression models with known mixing +#' proportions. See \code{data-raw/prepare_data.R} for the generation script. +"sampleData" diff --git a/deconvolve.R b/R/deconvolve.R similarity index 89% rename from deconvolve.R rename to R/deconvolve.R index a2eb369..aa69a34 100644 --- a/deconvolve.R +++ b/R/deconvolve.R @@ -28,22 +28,24 @@ #' } #' #' @examples -#' \dontrun{ -#' # Load example data -#' data(example_bulk) -#' data(example_reference) +#' # Load the built-in sample benchmark dataset +#' data(sampleData) #' -#' # Run deconvolution +#' # Run deconvolution on a small subset for speed +#' set.seed(42) +#' idx <- sample(ncol(sampleData$bulk), 10) #' results <- deconvolve( -#' bulk = example_bulk, -#' reference = example_reference, -#' n_cores = 4 +#' bulk = sampleData$bulk[, idx], +#' reference = sampleData$cellTypeExpr, +#' n_cores = 1, +#' maxit = 50 #' ) #' -#' # View proportions +#' # View estimated proportions (samples x cell types) #' head(results$proportions) -#' } #' +#' @importFrom parallel mclapply +#' @importFrom stats optim qnorm #' @export deconvolve <- function(bulk, reference, @@ -53,7 +55,6 @@ deconvolve <- function(bulk, maxit = 100, verbose = FALSE) { - # Input validation if (!is.matrix(bulk)) { bulk <- as.matrix(bulk) @@ -63,8 +64,7 @@ deconvolve <- function(bulk, } # Check for common genes - -common_genes <- intersect(rownames(bulk), rownames(reference)) + common_genes <- intersect(rownames(bulk), rownames(reference)) if (length(common_genes) == 0) { stop("No common genes found between bulk and reference matrices.") } @@ -110,7 +110,7 @@ common_genes <- intersect(rownames(bulk), rownames(reference)) # Apply spline-based normalization ranks <- rank(z) / length(z) - quantile_norm <- qnorm(pmax(0.001, pmin(0.999, ranks))) + quantile_norm <- stats::qnorm(pmax(0.001, pmin(0.999, ranks))) # Enhanced blending transformation with multi-level shrinkage signal_weight <- pmin(1, q / (10 + alpha * q)) * (1 - local_shrink) @@ -131,14 +131,9 @@ common_genes <- intersect(rownames(bulk), rownames(reference)) # Define loss function loss_function <- function(p, b, reference, alpha, power, transform_func) { - # Apply transformation to bulk b_transformed <- transform_func(b, alpha) - - # Compute fitted values fitted <- as.vector(reference %*% p) fitted_transformed <- transform_func(fitted, alpha) - - # Calculate loss return(sum(abs(b_transformed - fitted_transformed)^power)) } @@ -148,8 +143,8 @@ common_genes <- intersect(rownames(bulk), rownames(reference)) } results <- parallel::mclapply(seq_len(n_samples), mc.cores = n_cores, mc.preschedule = FALSE, function(i) { - result <- tryCatch({ - optim_result <- optim( + tryCatch({ + optim_result <- stats::optim( par = rep(1 / n_cell_types, n_cell_types), fn = loss_function, b = bulk[, i], @@ -162,7 +157,6 @@ common_genes <- intersect(rownames(bulk), rownames(reference)) upper = 100, control = list(maxit = maxit) ) - list( proportions = optim_result$par, convergence = optim_result$convergence, @@ -176,16 +170,17 @@ common_genes <- intersect(rownames(bulk), rownames(reference)) error = conditionMessage(e) ) }) - - return(result) }) # Extract proportions matrix proportions <- do.call(rbind, lapply(results, function(x) x$proportions)) + # Clamp: L-BFGS-B can return tiny negatives (~-1e-16) on some platforms + proportions <- pmax(proportions, 0) + # Normalize to sum to 1 row_sums <- rowSums(proportions) - row_sums[row_sums == 0] <- 1 # Avoid division by zero + row_sums[row_sums == 0] <- 1 proportions <- proportions / row_sums # Handle NAs diff --git a/R/package.R b/R/package.R new file mode 100644 index 0000000..30b4319 --- /dev/null +++ b/R/package.R @@ -0,0 +1,8 @@ +#' DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage +#' +#' A reference-based cellular deconvolution method that employs hierarchical +#' shrinkage transformation to robustly estimate cell-type proportions from +#' bulk RNA sequencing data. +#' +#' @importFrom Matrix sparseMatrix +"_PACKAGE" diff --git a/README.md b/README.md index f3b4709..6104e30 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ # DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage -[![R](https://img.shields.io/badge/R-%3E%3D4.0-blue.svg)](https://www.r-project.org/) +[![R-CMD-check](https://github.com/tinnlab/DeOPUS/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tinnlab/DeOPUS/actions/workflows/R-CMD-check.yaml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -DeOPUS is a reference-based cellular deconvolution method that employs hierarchical shrinkage transformation to robustly estimate cell-type proportions from bulk RNA sequencing data. +DeOPUS is a reference-based cellular deconvolution method that employs hierarchical +shrinkage transformation to robustly estimate cell-type proportions from bulk RNA-seq data. ## Overview -Single-cell RNA sequencing enables comprehensive transcriptomic profiling at single-cell resolution, but high costs limit its widespread application. DeOPUS offers a cost-effective alternative by computationally estimating cell-type proportions from bulk RNA-seq data. +Single-cell RNA sequencing enables comprehensive transcriptomic profiling at single-cell +resolution, but high costs limit its widespread application. DeOPUS offers a cost-effective +alternative by computationally estimating cell-type proportions from bulk RNA-seq data. **Key features:** - Hierarchical shrinkage transformation with local and global priors @@ -27,40 +30,74 @@ install.packages("devtools") devtools::install_github("tinnlab/DeOPUS") ``` -### Dependencies +### Optional dependencies (only for the benchmark / figure scripts) + +The core `deconvolve()` function needs nothing beyond what `install_github` +installs. The reproducibility scripts in `inst/scripts/` have additional needs: ```r -install.packages(c("Matrix", "dplyr", "parallel")) +# Benchmark runner +install.packages(c("RhpcBLASctl", "dplyr")) + +# Figure generation +install.packages(c("tidyverse", "scales", "cowplot", "gridExtra")) + +# Bioconductor (used by visualize_results.R) +if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") +BiocManager::install(c("ComplexHeatmap", "circlize")) + +# OPTIONAL — only for competing methods (MuSiC / AutoGeneS / CIBERSORT / +# FARDEEP / scaden / AdRoit). Requires Docker installed and running. +# install.packages("DeconBenchmark") ``` ## Quick Start +The package ships with a built-in simulated benchmark dataset (`sampleData`): +11,852 genes × 512 samples with known ground-truth proportions for 2 cell types. + ```r library(DeOPUS) -# Load your data -# cellTypeExpr: reference expression matrix (genes x cell types) -# bulk: bulk expression matrix (genes x samples) +# Load the built-in sample benchmark dataset +data(sampleData) + +# Run deconvolution on a small subset for speed +set.seed(42) +idx <- sample(ncol(sampleData$bulk), 10) -# Run deconvolution results <- deconvolve( - bulk = bulk_matrix, - reference = reference_matrix, - alpha = 0.01, - n_cores = 4 + bulk = sampleData$bulk[, idx], + reference = sampleData$cellTypeExpr, + alpha = 0.01, + n_cores = 1, + verbose = TRUE ) -# View estimated proportions +# View estimated proportions (samples x cell types) head(results$proportions) + +# Evaluate against ground truth +# IMPORTANT: bulkRatio rows and cellTypeExpr columns may be in different orders, +# so align by name before computing correlations. +ct <- colnames(results$proportions) +true_props <- t(sampleData$bulkRatio[ct, idx, drop = FALSE]) # 10 x 2 aligned + +cor_values <- sapply(seq_len(nrow(results$proportions)), function(i) { + cor(results$proportions[i, ], true_props[i, ], method = "pearson") +}) +cat("Mean Pearson correlation:", round(mean(cor_values, na.rm = TRUE), 3), "\n") ``` ## Input Data Format DeOPUS requires two main inputs: -1. **Reference expression matrix** (`reference`): A genes × cell types matrix containing average expression profiles for each cell type, typically derived from scRNA-seq data. - -2. **Bulk expression matrix** (`bulk`): A genes × samples matrix containing bulk RNA-seq expression data to be deconvolved. +1. **Reference expression matrix** (`reference`): A genes × cell types matrix + containing average expression profiles for each cell type, typically derived + from scRNA-seq data. +2. **Bulk expression matrix** (`bulk`): A genes × samples matrix containing bulk + RNA-seq expression data to be deconvolved. Both matrices should: - Have matching gene identifiers (rownames) @@ -76,7 +113,8 @@ DeOPUS applies a multi-level adaptive transformation: 3. **Power transformation**: Stabilizes variance across the dynamic range 4. **Quantile normalization**: Ensures robust comparison between predicted and observed profiles -The optimization minimizes the weighted loss between transformed bulk and reconstructed expression profiles using L-BFGS-B with box constraints. +The optimization minimizes the weighted loss between transformed bulk and reconstructed +expression profiles using L-BFGS-B with box constraints. ## Benchmarking @@ -90,57 +128,94 @@ We benchmarked DeOPUS against six state-of-the-art methods: ### Running Benchmarks -```r -# Run benchmark on simulated data -source("scripts/benchmark/run_benchmark_simulated.R") +Benchmark and visualization scripts ship in `inst/scripts/` and are accessible +after install via `system.file()`. The benchmark expects a directory of `.rds` +input files where each file is a list matching the schema of `data(sampleData)` +(`$bulk`, `$cellTypeExpr`, `$bulkRatio`, etc.). + +To benchmark **DeOPUS** on the bundled `sampleData` end-to-end: -# Run benchmark on real data -source("scripts/benchmark/run_benchmark_real.R") +```r +library(DeOPUS) +source(system.file("scripts/benchmark/run_benchmark_real.R", package = "DeOPUS")) + +# Prepare an input dir with at least one dataset .rds file +input_dir <- file.path(tempdir(), "decopus_inputs") +output_dir <- file.path(tempdir(), "decopus_results") +dir.create(input_dir, recursive = TRUE, showWarnings = FALSE) +dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + +# Use the bundled sampleData as the benchmark input +data(sampleData) +saveRDS(sampleData, file.path(input_dir, "sampleData.rds")) + +# Run benchmark — restrict to DeOPUS unless you have DeconBenchmark + Docker +run_benchmark( + data_dir = input_dir, + output_dir = output_dir, + methods = "DeOPUS" +) -# Generate visualization dashboard -source("scripts/analysis/visualize_results.R") -results <- create_summary_barplot_dashboard(benchmark_data) +# Aggregate per-sample, per-cell-type results into a long-format dataframe +benchmark_data <- aggregate_results(output_dir = output_dir, methods = "DeOPUS") +head(benchmark_data) ``` +To include MuSiC, AutoGeneS, CIBERSORT, FARDEEP, scaden, AdRoit, install the +optional `DeconBenchmark` package (requires Docker) and pass them in +`methods = c("DeOPUS", "MuSiC", ...)`. + ## Reproducing Paper Results -To reproduce the results from our paper: +The benchmark scripts expect a directory of `.rds` datasets (one per real +benchmark dataset, matching the schema of `data(sampleData)`) at +`data/real_datasets/`, and write results to `results/real_benchmark/` and +figures to `figures/`. ```bash -# Clone the repository git clone https://github.com/tinnlab/DeOPUS.git cd DeOPUS -# Run the complete benchmark pipeline -Rscript scripts/benchmark/run_all_benchmarks.R +# Place benchmark datasets here (one .rds per dataset) +mkdir -p data/real_datasets +# cp /path/to/your/*.rds data/real_datasets/ + +# Run benchmark pipeline (defaults: data/real_datasets/ → results/real_benchmark/) +Rscript inst/scripts/benchmark/run_benchmark_real.R -# Generate figures -Rscript scripts/analysis/generate_figures.R +# Generate figures (written to figures/) +Rscript inst/scripts/analysis/generate_figures.R ``` ### Data Availability -Benchmark datasets are available at https://doi.org/10.5281/zenodo.19050845 or can be generated using: +Benchmark datasets are available at +https://doi.org/10.5281/zenodo.19050845. The simulation generator and a +template for preparing real GEO datasets are in +`inst/scripts/data/prepare_data.R` — source the file, then call +`create_simulated_dataset()` or adapt `prepare_geo_dataset()` to your data: ```r -source("scripts/data/prepare_benchmark_data.R") +source(system.file("scripts/data/prepare_data.R", package = "DeOPUS")) +# sampleData_like <- create_simulated_dataset(n_samples = 100, n_cell_types = 5) ``` ## Output -DeOPUS returns a list containing: +`deconvolve()` returns a list containing: -- `proportions`: Matrix of estimated cell-type proportions (samples × cell types) -- `convergence`: Optimization convergence information for each sample +- `proportions`: Matrix of estimated cell-type proportions (samples × cell types). Each row sums to 1. +- `convergence`: Named list of per-sample optimization convergence information (one entry per sample, holding `convergence`, `value`, and optionally an `error` message). ## Parameters | Parameter | Default | Description | |-----------|---------|-------------| -| `alpha` | 0.01 | Regularization parameter for transformation | -| `n_cores` | 1 | Number of parallel cores | -| `maxit` | 100 | Maximum optimization iterations | -| `power` | 2 | Loss function power (1=MAE, 2=MSE) | +| `alpha` | 0.01 | Regularization parameter for transformation | +| `power` | 2 | Loss function power (1 = MAE, 2 = MSE) | +| `n_cores` | 1 | Number of parallel cores | +| `maxit` | 100 | Maximum optimization iterations | +| `verbose` | FALSE | Print progress messages | ## Citation @@ -148,21 +223,12 @@ If you use DeOPUS in your research, please cite: ```bibtex @article{DeOPUS2025, - title={DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage}, - author={}, - journal={}, - year={2025}, - doi={} + title = {DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage}, + author = {Ha Nguyen}, + year = {2025} } ``` -[//]: # (## License) - -[//]: # () -[//]: # (This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.) - -[//]: # () -[//]: # (## Contact) +## License -[//]: # () -[//]: # (For questions or issues, please open an issue on GitHub or contact [email].) +MIT © Ha Nguyen — see [LICENSE.md](LICENSE.md) for full text. diff --git a/data-raw/prepare_data.R b/data-raw/prepare_data.R new file mode 100644 index 0000000..514f30a --- /dev/null +++ b/data-raw/prepare_data.R @@ -0,0 +1,17 @@ +## Script that generated data/sampleData.rda +## +## Run this script from the package root to regenerate the package dataset +## from the original source file. + +# ── sampleData ──────────────────────────────────────────────────────────────── +# +# The source file was a pre-generated simulation containing: +# 11852 genes x 512 simulated bulk RNA-seq samples +# 2 cell types (mature NK T cell, mesenchymal stem cell) +# Matching ground-truth proportions (bulkRatio) + +# Load original source (adjust path as needed) +sampleData <- readRDS("path/to/source_simulation.rds") + +# Save as compressed .rda for use as package data +usethis::use_data(sampleData, overwrite = TRUE, compress = "xz") diff --git a/data/sampleData.rda b/data/sampleData.rda new file mode 100644 index 0000000..52350bb Binary files /dev/null and b/data/sampleData.rda differ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..bcee21d --- /dev/null +++ b/environment.yml @@ -0,0 +1,13 @@ +name: decopus +channels: + - conda-forge + - defaults +dependencies: + - r-base>=4.3 + - r-matrix + - r-devtools + - r-testthat + - r-roxygen2 + - r-knitr + - r-rmarkdown + - r-ggplot2 diff --git a/DeOPUS/scripts/analysis/generate_figures.R b/inst/scripts/analysis/generate_figures.R similarity index 95% rename from DeOPUS/scripts/analysis/generate_figures.R rename to inst/scripts/analysis/generate_figures.R index 932fecf..d9fc469 100644 --- a/DeOPUS/scripts/analysis/generate_figures.R +++ b/inst/scripts/analysis/generate_figures.R @@ -7,8 +7,13 @@ library(tidyverse) library(scales) library(cowplot) -# Source visualization functions -source("scripts/analysis/visualize_results.R") +# Source visualization functions. Prefer the installed-package location +# (system.file resolves after install_github / R CMD INSTALL); fall back to +# the in-repo path when running from a clone. +.viz_path <- system.file("scripts/analysis/visualize_results.R", + package = "DeOPUS") +if (!nzchar(.viz_path)) .viz_path <- "inst/scripts/analysis/visualize_results.R" +source(.viz_path) ################################################################################ # Configuration diff --git a/DeOPUS/scripts/analysis/visualize_results.R b/inst/scripts/analysis/visualize_results.R similarity index 100% rename from DeOPUS/scripts/analysis/visualize_results.R rename to inst/scripts/analysis/visualize_results.R diff --git a/DeOPUS/scripts/benchmark/run_benchmark_real.R b/inst/scripts/benchmark/run_benchmark_real.R similarity index 100% rename from DeOPUS/scripts/benchmark/run_benchmark_real.R rename to inst/scripts/benchmark/run_benchmark_real.R diff --git a/DeOPUS/scripts/data/prepare_data.R b/inst/scripts/data/prepare_data.R similarity index 100% rename from DeOPUS/scripts/data/prepare_data.R rename to inst/scripts/data/prepare_data.R diff --git a/man/DeOPUS-package.Rd b/man/DeOPUS-package.Rd new file mode 100644 index 0000000..7a5de89 --- /dev/null +++ b/man/DeOPUS-package.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/package.R +\docType{package} +\name{DeOPUS-package} +\alias{DeOPUS} +\alias{DeOPUS-package} +\title{DeOPUS: Deconvolution via Optimized Power-transformed Unmixing with Shrinkage} +\description{ +A reference-based cellular deconvolution method that employs hierarchical +shrinkage transformation to robustly estimate cell-type proportions from +bulk RNA sequencing data. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/tinnlab/DeOPUS} + \item Report bugs at \url{https://github.com/tinnlab/DeOPUS/issues} +} + +} +\author{ +\strong{Maintainer}: Ha Nguyen \email{hvn0006@wayne.edu} + +} diff --git a/man/deconvolve.Rd b/man/deconvolve.Rd new file mode 100644 index 0000000..261005e --- /dev/null +++ b/man/deconvolve.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deconvolve.R +\name{deconvolve} +\alias{deconvolve} +\title{Deconvolve bulk RNA-seq data using DeOPUS} +\usage{ +deconvolve( + bulk, + reference, + alpha = 0.01, + power = 2, + n_cores = 1, + maxit = 100, + verbose = FALSE +) +} +\arguments{ +\item{bulk}{A numeric matrix of bulk expression data (genes x samples).} + +\item{reference}{A numeric matrix of reference expression profiles (genes x cell types).} + +\item{alpha}{Numeric. Regularization parameter for transformation. Default is 0.01.} + +\item{power}{Numeric. Power for loss function (1 = MAE, 2 = MSE). Default is 2.} + +\item{n_cores}{Integer. Number of cores for parallel processing. Default is 1.} + +\item{maxit}{Integer. Maximum number of optimization iterations. Default is 100.} + +\item{verbose}{Logical. Whether to print progress messages. Default is FALSE.} +} +\value{ +A list containing: +\itemize{ + \item \code{proportions}: Matrix of estimated cell-type proportions (samples x cell types) + \item \code{convergence}: List of convergence information for each sample +} +} +\description{ +Estimates cell-type proportions from bulk RNA-seq data using a reference-based +approach with hierarchical shrinkage transformation. +} +\details{ +DeOPUS applies a multi-level adaptive transformation combining: +\itemize{ + \item Local shrinkage: Attenuates influence of high-variance genes + \item Global shrinkage: Provides overall regularization + \item Power transformation: Stabilizes variance + \item Quantile normalization: Ensures robust profile comparison +} +} +\examples{ +# Load the built-in sample benchmark dataset +data(sampleData) + +# Run deconvolution on a small subset for speed +set.seed(42) +idx <- sample(ncol(sampleData$bulk), 10) +results <- deconvolve( + bulk = sampleData$bulk[, idx], + reference = sampleData$cellTypeExpr, + n_cores = 1, + maxit = 50 +) + +# View estimated proportions (samples x cell types) +head(results$proportions) + +} diff --git a/man/sampleData.Rd b/man/sampleData.Rd new file mode 100644 index 0000000..81ed08f --- /dev/null +++ b/man/sampleData.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sampleData} +\alias{sampleData} +\title{Sample bulk RNA-seq deconvolution benchmark dataset} +\format{ +A list with the following components: +\describe{ + \item{bulk}{Numeric matrix (11852 genes x 512 samples) of simulated bulk + RNA-seq expression values in linear scale. Row names are Ensembl gene IDs + (e.g. ENSG00000000419); column names are Mixture1 through Mixture512.} + \item{bulkRatio}{Numeric matrix (2 cell types x 512 samples) containing + the ground-truth cell-type proportions used to generate \code{bulk}. + Row names are \code{"mature NK T cell"} and \code{"mesenchymal stem cell"}. + Note: row order may differ from the column order of \code{cellTypeExpr}, + so align by name before comparing predictions to ground truth.} + \item{cellTypeExpr}{Numeric matrix (11852 genes x 2 cell types) of reference + expression profiles derived from single-cell RNA-seq data. + Column names match (but may be in a different order from) the row names of + \code{bulkRatio}.} + \item{signature}{Numeric matrix (974 signature genes x 2 cell types) of + the most discriminative marker genes for each cell type.} + \item{nCellTypes}{Integer. Number of cell types (2).} + \item{markers}{Named list of marker gene vectors, one entry per cell type.} + \item{sigGenes}{Character vector of signature gene names.} + \item{singleCellExpr}{Sparse matrix (dgCMatrix) of the single-cell RNA-seq + expression data used to derive the reference profiles.} + \item{singleCellLabels}{Character vector of cell-type labels for each cell + in \code{singleCellExpr}.} + \item{singleCellSubjects}{Character vector of donor/subject IDs for each + cell in \code{singleCellExpr}.} +} +} +\source{ +Simulated using negative-binomial expression models with known mixing +proportions. See \code{data-raw/prepare_data.R} for the generation script. +} +\usage{ +data(sampleData) +} +\description{ +A simulated benchmark dataset with known ground-truth cell-type proportions +for evaluating and demonstrating the DeOPUS deconvolution method. +} +\examples{ +data(sampleData) + +# Inspect the dataset +dim(sampleData$bulk) # 11852 x 512 +dim(sampleData$cellTypeExpr) # 11852 x 2 + +# Run deconvolution on a small subset for speed +set.seed(42) +idx <- sample(ncol(sampleData$bulk), 10) +results <- deconvolve( + bulk = sampleData$bulk[, idx], + reference = sampleData$cellTypeExpr, + n_cores = 1, + maxit = 50 +) + +# Align bulkRatio cell-type ORDER to match results$proportions columns +# (bulkRatio rows and cellTypeExpr columns may be in a different order). +ct <- colnames(results$proportions) +true_props <- t(sampleData$bulkRatio[ct, idx, drop = FALSE]) + +# Per-sample Pearson correlation with ground truth +cor_values <- sapply(seq_len(nrow(results$proportions)), function(i) { + cor(results$proportions[i, ], true_props[i, ], method = "pearson") +}) +mean(cor_values, na.rm = TRUE) + +} +\keyword{datasets} diff --git a/DeOPUS/tests/testthat.R b/tests/testthat.R similarity index 100% rename from DeOPUS/tests/testthat.R rename to tests/testthat.R diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R new file mode 100644 index 0000000..75f419b --- /dev/null +++ b/tests/testthat/test-data.R @@ -0,0 +1,110 @@ +library(DeOPUS) + +# ── sampleData structure ────────────────────────────────────────────────────── + +test_that("sampleData loads and is a list", { + data(sampleData) + expect_true(is.list(sampleData)) +}) + +test_that("sampleData has required components", { + data(sampleData) + expect_true(all(c("bulk", "bulkRatio", "cellTypeExpr", "signature", + "singleCellLabels", "singleCellSubjects") %in% names(sampleData))) +}) + +test_that("sampleData$bulk is a numeric matrix with named dimensions", { + data(sampleData) + expect_true(is.matrix(sampleData$bulk)) + expect_true(is.numeric(sampleData$bulk)) + expect_false(is.null(rownames(sampleData$bulk))) + expect_false(is.null(colnames(sampleData$bulk))) +}) + +test_that("sampleData$bulk has expected dimensions (11852 genes x 512 samples)", { + data(sampleData) + expect_equal(nrow(sampleData$bulk), 11852) + expect_equal(ncol(sampleData$bulk), 512) +}) + +test_that("sampleData$cellTypeExpr has expected dimensions (11852 genes x 2 cell types)", { + data(sampleData) + expect_equal(nrow(sampleData$cellTypeExpr), 11852) + expect_equal(ncol(sampleData$cellTypeExpr), 2) +}) + +test_that("sampleData$bulk and $cellTypeExpr share gene names", { + data(sampleData) + common <- intersect(rownames(sampleData$bulk), rownames(sampleData$cellTypeExpr)) + expect_gt(length(common), 0) +}) + +test_that("sampleData$bulkRatio proportions are between 0 and 1", { + data(sampleData) + expect_true(all(sampleData$bulkRatio >= 0)) + expect_true(all(sampleData$bulkRatio <= 1)) +}) + +test_that("sampleData$bulkRatio column sums are approximately 1", { + data(sampleData) + col_sums <- colSums(sampleData$bulkRatio) + expect_true(all(abs(col_sums - 1) < 1e-6)) +}) + +test_that("sampleData bulk values are non-negative", { + data(sampleData) + expect_true(all(sampleData$bulk >= 0)) +}) + +test_that("sampleData bulkRatio rows and cellTypeExpr columns refer to the same cell types", { + data(sampleData) + # Same set of names, possibly in different order. Aligning by name is therefore + # always safe (and necessary, since the order does differ in this dataset). + expect_setequal(rownames(sampleData$bulkRatio), colnames(sampleData$cellTypeExpr)) +}) + +# ── Integration test: deconvolve on a subset of sampleData ─────────────────── +# +# This test verifies the *correlation code logic* — alignment by cell-type name, +# finite outputs, positive sign. It does NOT assert a specific accuracy +# threshold, because the deconvolution algorithm may legitimately change in the +# future. The > 0 assertion exists to catch the misalignment bug where unaligned +# columns produced cor = -1 instead of +1. + +test_that("deconvolve runs on sampleData and produces correctly-aligned correlations", { + data(sampleData) + + set.seed(42) + idx <- sample(ncol(sampleData$bulk), 10) + + results <- deconvolve( + bulk = sampleData$bulk[, idx], + reference = sampleData$cellTypeExpr, + n_cores = 1, + maxit = 50 + ) + + # Output shape + expect_equal(nrow(results$proportions), 10) + expect_equal(ncol(results$proportions), 2) + expect_true(all(results$proportions >= 0)) + expect_true(all(abs(rowSums(results$proportions) - 1) < 1e-6)) + + # Align bulkRatio cell-type ORDER to match results$proportions columns + ct <- colnames(results$proportions) + true_props <- t(sampleData$bulkRatio[ct, idx, drop = FALSE]) + + cor_values <- sapply(seq_len(nrow(results$proportions)), function(i) { + cor(results$proportions[i, ], true_props[i, ], method = "pearson") + }) + + # One correlation per sample + expect_length(cor_values, 10) + + # All finite (catches NA/NaN propagation in the correlation code) + expect_true(all(is.finite(cor_values))) + + # Sign check: catches the misalignment bug. Naive cor() on reversed cell-type + # order would give mean = -1 instead of a positive number. + expect_gt(mean(cor_values), 0) +}) diff --git a/tests/testthat/test-deconvolve.R b/tests/testthat/test-deconvolve.R new file mode 100644 index 0000000..4c9fad6 --- /dev/null +++ b/tests/testthat/test-deconvolve.R @@ -0,0 +1,217 @@ +library(DeOPUS) + +# ── Helpers ──────────────────────────────────────────────────────────────────── + +make_data <- function(n_genes = 100, n_cell_types = 3, n_samples = 5, seed = 42) { + set.seed(seed) + reference <- matrix( + rpois(n_genes * n_cell_types, lambda = 100), + nrow = n_genes, ncol = n_cell_types + ) + rownames(reference) <- paste0("Gene", seq_len(n_genes)) + colnames(reference) <- paste0("CellType", seq_len(n_cell_types)) + + true_props <- matrix( + runif(n_samples * n_cell_types), + nrow = n_samples, ncol = n_cell_types + ) + true_props <- true_props / rowSums(true_props) + + bulk <- reference %*% t(true_props) + + matrix(rnorm(n_genes * n_samples, sd = 5), nrow = n_genes) + bulk[bulk < 0] <- 0 + colnames(bulk) <- paste0("Sample", seq_len(n_samples)) + + list(bulk = bulk, reference = reference, true_props = true_props) +} + +# ── Core functionality ───────────────────────────────────────────────────────── + +test_that("deconvolve returns list with proportions and convergence", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_true(is.list(res)) + expect_named(res, c("proportions", "convergence")) +}) + +test_that("proportions matrix has correct dimensions", { + d <- make_data(n_genes = 100, n_cell_types = 3, n_samples = 5) + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_equal(nrow(res$proportions), 5) + expect_equal(ncol(res$proportions), 3) +}) + +test_that("proportions sum to 1 for every sample", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + row_sums <- rowSums(res$proportions) + expect_true(all(abs(row_sums - 1) < 1e-6)) +}) + +test_that("proportions are non-negative", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_true(all(res$proportions >= 0)) +}) + +test_that("output row names match bulk column names", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_equal(rownames(res$proportions), colnames(d$bulk)) +}) + +test_that("output column names match reference column names", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_equal(colnames(res$proportions), colnames(d$reference)) +}) + +test_that("convergence list has one entry per sample", { + d <- make_data(n_samples = 4) + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_length(res$convergence, 4) + expect_equal(names(res$convergence), colnames(d$bulk)) +}) + +# ── Parameter variants ───────────────────────────────────────────────────────── + +test_that("deconvolve works with power = 1 (MAE loss)", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, power = 1, n_cores = 1, maxit = 50) + + expect_true(all(res$proportions >= 0)) + expect_true(all(abs(rowSums(res$proportions) - 1) < 1e-6)) +}) + +test_that("deconvolve works with maxit = 1 (minimal iterations)", { + d <- make_data() + expect_no_error(deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 1)) +}) + +test_that("deconvolve works with custom alpha", { + d <- make_data() + res <- deconvolve(d$bulk, d$reference, alpha = 0.1, n_cores = 1, maxit = 50) + + expect_true(all(res$proportions >= 0)) +}) + +test_that("verbose = TRUE emits messages without error", { + d <- make_data() + expect_no_error( + expect_message( + deconvolve(d$bulk, d$reference, verbose = TRUE, n_cores = 1, maxit = 5) + ) + ) +}) + +# ── Input coercion ───────────────────────────────────────────────────────────── + +test_that("deconvolve accepts data.frame inputs and coerces to matrix", { + d <- make_data() + bulk_df <- as.data.frame(d$bulk) + ref_df <- as.data.frame(d$reference) + + res <- deconvolve(bulk_df, ref_df, n_cores = 1, maxit = 50) + + expect_true(is.matrix(res$proportions)) + expect_equal(nrow(res$proportions), ncol(d$bulk)) +}) + +# ── Gene overlap handling ────────────────────────────────────────────────────── + +test_that("deconvolve handles partial gene overlap", { + set.seed(1) + reference <- matrix(rpois(100 * 3, lambda = 100), nrow = 100, ncol = 3) + rownames(reference) <- paste0("Gene", 1:100) + colnames(reference) <- paste0("CellType", 1:3) + + bulk <- matrix(rpois(100 * 2, lambda = 100), nrow = 100, ncol = 2) + rownames(bulk) <- paste0("Gene", 50:149) + colnames(bulk) <- paste0("Sample", 1:2) + + res <- deconvolve(bulk, reference, n_cores = 1, maxit = 50) + + expect_equal(nrow(res$proportions), 2) + expect_equal(ncol(res$proportions), 3) +}) + +test_that("deconvolve errors when no genes overlap", { + reference <- matrix(1:9, nrow = 3, ncol = 3) + rownames(reference) <- c("GeneA", "GeneB", "GeneC") + colnames(reference) <- paste0("CellType", 1:3) + + bulk <- matrix(1:6, nrow = 3, ncol = 2) + rownames(bulk) <- c("GeneX", "GeneY", "GeneZ") + colnames(bulk) <- paste0("Sample", 1:2) + + expect_error(deconvolve(bulk, reference), "No common genes") +}) + +# ── Edge cases ───────────────────────────────────────────────────────────────── + +test_that("deconvolve works with a single sample", { + d <- make_data(n_samples = 1) + res <- deconvolve(d$bulk, d$reference, n_cores = 1, maxit = 50) + + expect_equal(nrow(res$proportions), 1) + expect_true(abs(sum(res$proportions) - 1) < 1e-6) +}) + +test_that("deconvolve works with a single cell type", { + set.seed(7) + ref <- matrix(rpois(50, lambda = 100), nrow = 50, ncol = 1) + rownames(ref) <- paste0("Gene", 1:50) + colnames(ref) <- "CellType1" + + bulk <- matrix(rpois(50 * 3, lambda = 100), nrow = 50, ncol = 3) + rownames(bulk) <- paste0("Gene", 1:50) + colnames(bulk) <- paste0("Sample", 1:3) + + res <- deconvolve(bulk, ref, n_cores = 1, maxit = 50) + + expect_equal(ncol(res$proportions), 1) + expect_true(all(res$proportions >= 0)) + # With one cell type, proportions are either 1 (converged) or 0 (failed) + expect_true(all(res$proportions %in% c(0, 1) | abs(res$proportions - 1) < 1e-6)) +}) + +test_that("proportions remain non-negative under heavy noise", { + set.seed(99) + n_genes <- 200 + ref <- matrix(rpois(n_genes * 4, lambda = 200), nrow = n_genes, ncol = 4) + rownames(ref) <- paste0("Gene", seq_len(n_genes)) + colnames(ref) <- paste0("CT", 1:4) + + bulk <- matrix(abs(rnorm(n_genes * 10, mean = 100, sd = 500)), nrow = n_genes) + rownames(bulk) <- paste0("Gene", seq_len(n_genes)) + colnames(bulk) <- paste0("S", 1:10) + + res <- deconvolve(bulk, ref, n_cores = 1, maxit = 30) + + # Non-negativity must always hold + expect_true(all(res$proportions >= 0)) + # Row sums are either 1 (converged) or 0 (optimization failed under extreme noise) + row_sums <- rowSums(res$proportions) + expect_true(all(abs(row_sums - 1) < 1e-6 | row_sums == 0)) +}) + +# ── Internal utility ─────────────────────────────────────────────────────────── + +test_that("%||% returns left value when non-NULL", { + f <- DeOPUS:::`%||%` + expect_equal(f("a", "b"), "a") + expect_equal(f(42L, 0L), 42L) +}) + +test_that("%||% returns right value when left is NULL", { + f <- DeOPUS:::`%||%` + expect_equal(f(NULL, "default"), "default") + expect_null(f(NULL, NULL)) +}) diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd new file mode 100644 index 0000000..07590e8 --- /dev/null +++ b/vignettes/getting_started.Rmd @@ -0,0 +1,159 @@ +--- +title: "Getting Started with DeOPUS" +author: "Ha Nguyen" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting Started with DeOPUS} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5 +) +``` + +## Introduction + +DeOPUS (Deconvolution via Optimized Power-transformed Unmixing with Shrinkage) +is a reference-based cellular deconvolution method that estimates cell-type +proportions from bulk RNA sequencing data. + +## Installation + +Install DeOPUS from GitHub: + +```{r install, eval=FALSE} +# Install devtools if not already installed +install.packages("devtools") + +# Install DeOPUS +devtools::install_github("tinnlab/DeOPUS") +``` + +Load the package: + +```{r load} +library(DeOPUS) +``` + +## Input Data Format + +DeOPUS requires two inputs: + +1. **Bulk matrix** (`bulk`): A genes × samples matrix of bulk RNA-seq expression + values in linear scale (non-negative, not log-transformed). +2. **Reference matrix** (`reference`): A genes × cell types matrix of average + expression profiles per cell type, typically derived from scRNA-seq. + +Both matrices must have matching gene identifiers as row names. + +## Quick Example + +The package includes a built-in simulated benchmark dataset (`sampleData`) with +known ground-truth proportions for 2 cell types across 512 samples. + +```{r sample-data, eval=FALSE} +# Load the built-in sample dataset +data(sampleData) + +# Inspect dimensions +dim(sampleData$bulk) # 11852 genes x 512 samples +dim(sampleData$cellTypeExpr) # 11852 genes x 2 cell types + +# Run deconvolution on a small subset (10 samples for speed) +set.seed(42) +idx <- sample(ncol(sampleData$bulk), 10) + +results <- deconvolve( + bulk = sampleData$bulk[, idx], + reference = sampleData$cellTypeExpr, + alpha = 0.01, + n_cores = 1, + verbose = TRUE +) + +# View estimated proportions (samples x cell types) +head(results$proportions) +``` + +### Evaluate Against Ground Truth + +`bulkRatio` row names and `cellTypeExpr` column names may be in different +orders, so align by cell-type name before computing the correlation: + +```{r evaluate, eval=FALSE} +# Align bulkRatio cell-type ORDER to match results$proportions columns +ct <- colnames(results$proportions) +true_props <- t(sampleData$bulkRatio[ct, idx, drop = FALSE]) # 10 x 2 + +# Pearson correlation per sample +cor_values <- sapply(seq_len(nrow(results$proportions)), function(i) { + cor(results$proportions[i, ], true_props[i, ], method = "pearson") +}) + +cat("Mean Pearson correlation:", round(mean(cor_values, na.rm = TRUE), 3), "\n") +``` + +## Method Details + +DeOPUS applies a multi-level adaptive transformation: + +### 1. Hierarchical Shrinkage + +- **Local shrinkage** ($\tau$): Attenuates the influence of individual + high-variance genes. +- **Global shrinkage** ($\phi$): Provides overall regularization across the + expression profile. + +### 2. Power Transformation + +A variance-stabilizing power transformation handles the high dynamic range: + +$$z = \frac{(q/s + \kappa)^{\mu \omega g} - \kappa^\mu}{\mu g}$$ + +where $q$ is the expression value, $s$ is a scale factor, $g$ is the global +shrinkage weight, and $\kappa$, $\mu$, $\omega$ are shape parameters. + +### 3. Quantile Normalization + +Spline-based quantile normalization ensures robust comparison between profiles. + +### 4. Optimization + +DeOPUS uses L-BFGS-B optimization with box constraints (proportions ≥ 0) to +minimize the loss between transformed bulk and reconstructed expression. + +## Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `alpha` | 0.01 | Regularization parameter | +| `power` | 2 | Loss function power (1 = MAE, 2 = MSE) | +| `n_cores` | 1 | Parallel processing cores | +| `maxit` | 100 | Maximum optimization iterations | +| `verbose` | FALSE | Print progress messages | + +## Benchmarking + +The package ships benchmark and analysis scripts in `inst/scripts/`. After +installation, source them via `system.file()`: + +```{r benchmark, eval=FALSE} +# Run benchmark on real datasets +source(system.file("scripts/benchmark/run_benchmark_real.R", package = "DeOPUS")) + +# Generate visualization dashboard +source(system.file("scripts/analysis/visualize_results.R", package = "DeOPUS")) +``` + +## Session Info + +```{r sessioninfo} +sessionInfo() +```