diff --git a/DESCRIPTION b/DESCRIPTION index ad098aa..8babaff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,6 +60,7 @@ Imports: codetools Suggests: multcomp, + parameters, gridExtra, knitr, BiocStyle, diff --git a/R/msqrob-utils.R b/R/msqrob-utils.R index 9449e32..cdd119d 100644 --- a/R/msqrob-utils.R +++ b/R/msqrob-utils.R @@ -668,3 +668,27 @@ makeContrast <- function(contrasts, parameterNames) { ex, " to character" ) } + +# Match Quantitative Columns Order for msqrobLmxxx() methods +.matchQuantColsOrder <- function(colsMetadata, data) { + if (!"quantCols" %in% names(colsMetadata)) { + warning("colsMetadata does not contain a 'quantCols' column. Skipping reordering.") + return(colsMetadata) + } + + data_cols <- colnames(data) + quant_cols_values <- colsMetadata$quantCols + + missing_cols <- setdiff(data_cols, quant_cols_values) + if (length(missing_cols) > 0) { + stop( + "Column(s) '", paste(missing_cols, collapse = "', '"), + "' from data not found in colsMetadata$quantCols." + ) + } + + match_indices <- match(data_cols, quant_cols_values) + reordered_metadata <- colsMetadata[match_indices, , drop = FALSE] + + return(reordered_metadata) +} diff --git a/R/msqrob.R b/R/msqrob.R index 2247aa7..a15f294 100644 --- a/R/msqrob.R +++ b/R/msqrob.R @@ -56,10 +56,12 @@ msqrobLm <- function(y, data, robust = TRUE, maxitRob = 5) { + data <- .matchQuantColsOrder(data, y) myDesign <- model.matrix(formula, data) + # apply the model to each protein models <- apply(y, 1, function(y, design) { - ## computatability check + ## computability check obs <- is.finite(y) type <- "fitError" model <- list( @@ -145,7 +147,7 @@ msqrobLm <- function(y, ## Put variance and degrees of freedom in appropriate slots for (i in seq_len(length(models))) { mydf <- hlp$df.prior + getDF(models[[i]]) - models[[i]]@varPosterior <- as.numeric(hlp$var.post[i]) + models[[i]]@varPosterior <- as.numeric(hlp$var.post[[i]]) models[[i]]@dfPosterior <- as.numeric(mydf) } @@ -209,6 +211,9 @@ msqrobLm <- function(y, #' ‘lmerControl’ documentation of the lme4 package for more details. #' Default is `list(control = lmerControl(calc.derivs = FALSE))` #' +#' @param keep.model `boolean(1)` to specify if the `lme4` fitted model should be kept in the +#' `$model` field of the output. Default is `FALSE`. +#' #' @examples #' #' # Load example data @@ -265,7 +270,8 @@ msqrobLmer <- function(y, maxitRob = 1, doQR = TRUE, featureGroups=NULL, - lmerArgs = list(control = lmerControl(calc.derivs = FALSE))){ + lmerArgs = list(control = lmerControl(calc.derivs = FALSE)), + keep.model = FALSE) { #Get the featureGroups variable if (is.null(featureGroups)){ @@ -283,64 +289,47 @@ msqrobLmer <- function(y, y <- split.data.frame(y, featureGroups) - if (ridge == TRUE){ - if(is.null(rowdata)){ - models <- bplapply(y, - FUN = .ridge_msqrobLmer, - "formula" = formula, - "coldata" = data, - "doQR" = doQR, - "robust"=robust, - "maxitRob" = maxitRob, - "tol" =tol) - } else{ - models <- bpmapply(FUN = .ridge_msqrobLmer, - y, rowdata, - MoreArgs = list("formula" = formula, - "coldata" = data, - "doQR" = doQR, - "robust"=robust, - "maxitRob" = maxitRob, - "tol" =tol)) - } - - }else{ - - if(is.null(rowdata)){ - models <- bplapply(y, - FUN = .noridge_msqrobLmer, - "formula" = formula, - "coldata" = data, - "robust"=robust, - "maxitRob" = maxitRob, - "tol" =tol) - } else{ - models <- bpmapply(FUN = .noridge_msqrobLmer, - y, rowdata, - MoreArgs = list("formula" = formula, - "coldata" = data, - "robust"=robust, - "maxitRob" = maxitRob, - "tol" =tol)) - } + fit_args <- list( + formula = formula, + coldata = data, + robust = robust, + maxitRob = maxitRob, + tol = tol, + keep.model = keep.model + ) + if (ridge) { + fit_func <- .ridge_msqrobLmer + fit_args$doQR <- doQR + } else { + fit_func <- .noridge_msqrobLmer + } + models <- if(is.null(rowdata)){ + do.call(bplapply, append(list(y, FUN=fit_func), fit_args)) + } else { + bpmapply(FUN = fit_func, y, rowdata, MoreArgs = fit_args) } - - + modelVars <- vapply(models, getVar, numeric(1)) + modelDfs <- vapply(models, getDF, numeric(1)) + modelMask <- !is.na(modelVars) & !is.na(modelDfs) + varPosterior <- rep_along(models, NA_real_) hlp <- limma::squeezeVar( - var = vapply(models, getVar, numeric(1)), - df = vapply(models, getDF, numeric(1)) + var = modelVars[modelMask], + df = modelDfs[modelMask], ) + varPosterior[modelMask] <- hlp$var.post for (i in seq_len(length(models))) { - models[[i]]@varPosterior <- as.numeric(hlp$var.post[i]) + models[[i]]@varPosterior <- as.numeric(varPosterior[[i]]) models[[i]]@dfPosterior <- as.numeric(hlp$df.prior + getDF(models[[i]])) } return(models) } ## Fit the mixed models with ridge regression -.ridge_msqrobLmer <- function(y,rowdata=NULL,formula,coldata, doQR, robust,maxitRob=1,tol = 1e-06){ +.ridge_msqrobLmer <- function(y, rowdata=NULL, formula, coldata, doQR, + robust, maxitRob=1, tol = 1e-06, + keep.model = FALSE){ #Create the matrix containing the variable information data <- .create_data(y,rowdata,coldata) @@ -366,7 +355,7 @@ msqrobLmer <- function(y, formula <- formula(y ~ (1|ridge)) } else { if (nobars(formula)[[2]] != ~1){ - #udpate formula to remove any fixed effect variables and replace with ridge + # update formula to remove any fixed effect variables and replace with ridge formula <- formula( paste0("y ~ (1|ridge) + ", paste0("(",paste(findbars(formula), collapse=")+("),")"))) } else { @@ -458,17 +447,20 @@ msqrobLmer <- function(y, } }, silent = TRUE) - model <- .create_model(betas, vcovUnscaled, sigma, df.residual, w, model) + model <- .create_model(betas, vcovUnscaled, sigma, df.residual, + model, keep.model = keep.model) } return(StatModel(type = type, params = model, - varPosterior = as.numeric(NA), - dfPosterior = as.numeric(NA))) + varPosterior = NA_real_, + dfPosterior = NA_real_)) } ## Fit the mixed models without ridge regression -.noridge_msqrobLmer <- function(y,rowdata=NULL,formula,coldata, robust,maxitRob=0, tol = 1e-06 ){ +.noridge_msqrobLmer <- function(y, rowdata=NULL, formula, coldata, + robust, maxitRob=0, tol = 1e-06, + keep.model = FALSE){ #Create the matrix containing the variable information data <- .create_data(y,rowdata,coldata) @@ -514,13 +506,14 @@ msqrobLmer <- function(y, } }, silent = TRUE) - model <- .create_model(betas, vcovUnscaled, sigma, df.residual, w, model) + model <- .create_model(betas, vcovUnscaled, sigma, df.residual, + model, keep.model = keep.model) } return(StatModel(type = type, params = model, - varPosterior = as.numeric(NA), - dfPosterior = as.numeric(NA))) + varPosterior = NA_real_, + dfPosterior = NA_real_)) } @@ -590,6 +583,7 @@ msqrobLmer <- function(y, betaB } +#' Calculate the weighted REML residual degrees of freedom #' @importFrom stats resid .getDfLmer <- function(object) { w <- object@frame$"(weights)" @@ -611,21 +605,24 @@ msqrobLmer <- function(y, return(model) } -.create_model <- function(betas, vcovUnscaled, sigma, df.residual, w, model){ +.create_model <- function(betas, vcovUnscaled, sigma, df.residual, model, keep.model = FALSE){ if (df.residual<2L){ - model <- list(coefficients = NA, - vcovUnscaled = NA, - sigma = NA, - df.residual = NA, - w = NA) + res <- list(coefficients = NA, + vcovUnscaled = NA, + sigma = NA, + df.residual = NA, + w = NA) } else { - model <- list(coefficients = betas, - vcovUnscaled = vcovUnscaled, - sigma = sigma, - df.residual = df.residual, - w = model@frame$`(weights)`) + res <- list(coefficients = betas, + vcovUnscaled = vcovUnscaled, + sigma = sigma, + df.residual = df.residual, + w = model@frame$`(weights)`) } - return(model) + if (keep.model) { + res$model <- model + } + return(res) } diff --git a/R/topFeatures.R b/R/topFeatures.R index e4a6c51..934849d 100644 --- a/R/topFeatures.R +++ b/R/topFeatures.R @@ -13,7 +13,23 @@ #' Options, in increasing conservatism, include ‘"none"’, #' ‘"BH"’, ‘"BY"’ and ‘"holm"’. See ‘p.adjust’ for the complete #' list of options. Default is "BH" the Benjamini-Hochberg method -#' to controle the False Discovery Rate (FDR). +#' to control the *False Discovery Rate* (FDR). +#' @param ddf.method `character` specifying the method to calculate the +#' *denominator degrees of freedom* (DoF) for the t-test. +#' Options are: +#' - "residual": (default) use the posterior residual DoF (\code{getDfPosterior}), +#' which was the only DDoF method for *msqrob2* before v2.XX. +#' It can significantly overestimate the denominator DoF, +#' resulting in overly significant p-values. +#' - "ML1": use the `dof_ml1` function from the *parameters* package for +#' heuristic approximation of the DDoF, which is much faster than +#' the Kenward-Roger method, while still providing an adequate approximation. +#' - "KenwardRoger": use the `dof_kenward` function from the *parameters* package for +#' *Kenward-Roger approximation* of the DDoF. This is the most accurate, but +#' also most computationally intensive method. +#' - "Satterthwaite": use the `dof_satterthwaite` function from the *parameters* package for +#' *Satterthwaite approximation* of the DDoF. This is a faster, but less accurate +#' alternative to *Kenward-Roger* method, which may not work well for small sample sizes. #' @param sort `boolean(1)` to indicate if the features have to be sorted according #' to statistical significance. #' @param alpha `numeric` specifying the cutoff value for adjusted p-values. @@ -42,8 +58,14 @@ #' @rdname topTable #' @author Lieven Clement #' @export - -topFeatures <- function(models, contrast, adjust.method = "BH", sort = TRUE, alpha = 1) { +topFeatures <- function( + models, + contrast, + adjust.method = "BH", + ddf.method = c("residual", "ML1", "KenwardRoger", "Satterthwaite"), + sort = TRUE, + alpha = 1 +) { if (!is(contrast, "matrix")) { contrast <- as.matrix(contrast) } @@ -63,11 +85,40 @@ topFeatures <- function(models, contrast, adjust.method = "BH", sort = TRUE, alp numeric(1), L = contrast )) - df <- vapply(models, getDfPosterior, numeric(1)) + ddf.method <- match.arg(ddf.method) + if (ddf.method == "residual") { + df <- vapply(models, getDfPosterior, numeric(1)) + } else if (ddf.method %in% c("KenwardRoger", "ML1", "Satterthwaite")) { + if (!requireNamespace("parameters", quietly = TRUE)) { + stop("parameters package is required to calculate the denominator DoF using ", ddf.method, " method.") + } + ddf_func <- if (ddf.method == "KenwardRoger") { + parameters::dof_kenward + } else if (ddf.method == "ML1") { + parameters::dof_ml1 + } else if (ddf.method == "Satterthwaite") { + parameters::dof_satterthwaite + } else { + stop("Unsupported ddf.method=", ddf.method) + } + df <- bplapply(models, function(model) { + if ("model" %in% names(model@params) && is(model@params$model, "lmerMod")) { + lmm <- model@params$model + tryCatch(min(ddf_func(lmm)[rownames(contrast)]), + error = function(e) NA_real_) + } else { + NA_real_ + } + }) + df <- unlist(df, recursive = FALSE, use.names = FALSE) + } else { + stop("Unsupported ddf.method=", ddf.method) + } t <- logFC / se pval <- pt(-abs(t), df) * 2 adjPval <- p.adjust(pval, method = adjust.method) out <- data.frame(logFC, se, df, t, pval, adjPval) + rownames(out) <- names(models) if (alpha < 1) { signif <- adjPval < alpha out <- na.exclude(out[signif, ]) diff --git a/man/msqrobLmer.Rd b/man/msqrobLmer.Rd index 7d42f81..cb6d6bd 100644 --- a/man/msqrobLmer.Rd +++ b/man/msqrobLmer.Rd @@ -15,7 +15,8 @@ msqrobLmer( maxitRob = 1, doQR = TRUE, featureGroups = NULL, - lmerArgs = list(control = lmerControl(calc.derivs = FALSE)) + lmerArgs = list(control = lmerControl(calc.derivs = FALSE)), + keep.model = FALSE ) } \arguments{ @@ -61,6 +62,9 @@ containing control parameters, including the nonlinear optimizer to be used and parameters to be passed through to the nonlinear optimizer, see the ‘lmerControl’ documentation of the lme4 package for more details. Default is \code{list(control = lmerControl(calc.derivs = FALSE))}} + +\item{keep.model}{\code{boolean(1)} to specify if the \code{lme4} fitted model should be kept in the +\verb{$model} field of the output. Default is \code{FALSE}.} } \value{ A list of objects of the \code{StatModel} class. diff --git a/man/topTable.Rd b/man/topTable.Rd index 8d3cc76..77c44b8 100644 --- a/man/topTable.Rd +++ b/man/topTable.Rd @@ -4,7 +4,14 @@ \alias{topFeatures} \title{Toplist of DE proteins, peptides or features} \usage{ -topFeatures(models, contrast, adjust.method = "BH", sort = TRUE, alpha = 1) +topFeatures( + models, + contrast, + adjust.method = "BH", + ddf.method = c("residual", "ML1", "KenwardRoger", "Satterthwaite"), + sort = TRUE, + alpha = 1 +) } \arguments{ \item{models}{A list with elements of the class \code{StatModel} that are @@ -20,7 +27,24 @@ the p-values for multiple testing. Options, in increasing conservatism, include ‘"none"’, ‘"BH"’, ‘"BY"’ and ‘"holm"’. See ‘p.adjust’ for the complete list of options. Default is "BH" the Benjamini-Hochberg method -to controle the False Discovery Rate (FDR).} +to control the \emph{False Discovery Rate} (FDR).} + +\item{ddf.method}{\code{character} specifying the method to calculate the +\emph{denominator degrees of freedom} (DoF) for the t-test. +Options are: +- "residual": (default) use the posterior residual DoF (\code{getDfPosterior}), +which was the only DDoF method for \emph{msqrob2} before v2.XX. +It can significantly overestimate the denominator DoF, +resulting in overly significant p-values. +- "ML1": use the \code{dof_ml1} function from the \emph{parameters} package for +heuristic approximation of the DDoF, which is much faster than +the Kenward-Roger method, while still providing an adequate approximation. +- "KenwardRoger": use the \code{dof_kenward} function from the \emph{parameters} package for +\emph{Kenward-Roger approximation} of the DDoF. This is the most accurate, but +also most computationally intensive method. +- "Satterthwaite": use the \code{dof_satterthwaite} function from the \emph{parameters} package for +\emph{Satterthwaite approximation} of the DDoF. This is a faster, but less accurate +alternative to \emph{Kenward-Roger} method, which may not work well for small sample sizes.} \item{sort}{\code{boolean(1)} to indicate if the features have to be sorted according to statistical significance.}