From 3ff5a2ead9a5a45a1c7d16d9a5825349fb6083c3 Mon Sep 17 00:00:00 2001 From: DANIELA RODRIGUES Date: Fri, 24 Mar 2023 20:19:17 +0000 Subject: [PATCH 1/3] Included the option to personalise adjustment set for each candidate variable --- R/varimpact.R | 14 ++++++---- R/vim-numerics.R | 28 +++++++++++++++---- tests/testthat/test-vim-numerics.R | 44 ++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 tests/testthat/test-vim-numerics.R diff --git a/R/varimpact.R b/R/varimpact.R index 4c71501..47a7367 100644 --- a/R/varimpact.R +++ b/R/varimpact.R @@ -71,6 +71,8 @@ #' @param verbose_reduction Boolean - if TRUE, will display more detail during #' variable reduction step (clustering). #' @param digits Number of digits to round the value labels. +#' @param adjustment_exclusions List of variables to be removed from the adjustment set +#' of each variable for which we want to estimate importance. #' #' @return Results object. TODO: add more detail here. #' @@ -209,7 +211,8 @@ varimpact = verbose_tmle = FALSE, verbose_reduction = FALSE, parallel = TRUE, - digits = 4L) { + digits = 4L, + adjustment_exclusions = list()) { # Time the full function execution. time_start = proc.time() @@ -255,9 +258,9 @@ varimpact = ######## # Applied to Explanatory (X) data frame - sna = sapply(data, sum_na) + sna = sapply(X, sum_na) - n = nrow(data) + n = nrow(X) ####### # Missing proportion by variable. @@ -330,7 +333,8 @@ varimpact = adjust_cutoff = adjust_cutoff, verbose = verbose, verbose_tmle = verbose_tmle, - verbose_reduction = verbose_reduction) + verbose_reduction = verbose_reduction, + adjustment_exclusions= adjustment_exclusions) # Combine the separate continuous and factor results. results = @@ -365,6 +369,6 @@ varimpact = # Set a custom class so that we can override print and summary. class(results) = "varimpact" - + invisible(results) } diff --git a/R/vim-numerics.R b/R/vim-numerics.R index 374945b..bf7b341 100644 --- a/R/vim-numerics.R +++ b/R/vim-numerics.R @@ -13,6 +13,7 @@ vim_numerics = Qbounds, corthres, adjust_cutoff, + adjustment_exclusions, verbose = FALSE, verbose_tmle = FALSE, verbose_reduction = FALSE) { @@ -230,12 +231,26 @@ vim_numerics = numerics$miss.cont, factors$datafac.dumW, factors$miss.fac) - + + #print(nameA) + #print(var_i) + #print(colnames(W)) + # Remove any columns in which all values are NA. # CK: but we're using imputed data, so there should be no NAs actually. # (With the exception of the NA vectors possibly added above. W = W[, !apply(is.na(W), 2, all), drop = FALSE] + #print(nameA) + + # Remove variables from the adjustment set of each variable for which we want to measure importance + if (nameA %in% names(adjustment_exclusions) && length(adjustment_exclusions[[nameA]]) > 0) { + W = W[, setdiff(colnames(W), adjustment_exclusions[[nameA]]), drop = FALSE] + } + + #print(colnames(W)) + #print(class(W)) + # Separate adjustment matrix into the training and test folds. Wt = W[folds != fold_k, , drop = FALSE] Wv = W[folds == fold_k, , drop = FALSE] @@ -344,6 +359,7 @@ vim_numerics = # Create a list to hold the results for this level. bin_result = list( name = nameA, + W_names = colnames(W), cv_fold = fold_k, level = bin_j, level_label = At_bin_labels[bin_j], @@ -386,7 +402,7 @@ vim_numerics = # Save how many obs have this level/bin in this training fold. bin_result$train_cell_size = sum(IA) - + ############################################### # TODO: send to calculate_estimates first, which would calculate # TMLE, IPTW, G-Comp, and Unadj estimates. @@ -466,18 +482,18 @@ vim_numerics = } bin_result$test_msg = "success" - + # Save to the main list. bin_results[[bin_j]] = bin_result } - + # Finished looping over each level of the assignment variable # (primarily training fold, but now also some val fold work). if (verbose) cat(" done.\n") # Save individual bin results. fold_result$bin_results = bin_results - + # Create a dataframe version of the bin results. fold_result$bin_df = do.call(rbind, lapply(bin_results, function(result) { @@ -578,7 +594,7 @@ vim_numerics = "label = ", training_estimates[[minj]]$label, ")") } fold_result$message = message - + if (verbose) { cat(message, "\n") } diff --git a/tests/testthat/test-vim-numerics.R b/tests/testthat/test-vim-numerics.R new file mode 100644 index 0000000..b83f7b2 --- /dev/null +++ b/tests/testthat/test-vim-numerics.R @@ -0,0 +1,44 @@ +## Test + +test_that("adjustment set functionality works appropriately", { + + library(varimpact) + library(SuperLearner) + library(devtools) + library(testthat) + + # Data setup taken from Chris Kennedy Github page (https://github.com/ck37/varimpact) + set.seed(1, "L'Ecuyer-CMRG") + N <- 300 + num_normal <- 5 + X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) + Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) + # Add some missing data to X so we can test imputation. + for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA + + Q_lib = c("SL.mean", "SL.glmnet", "SL.ranger", "SL.rpartPrune") + g_lib = c("SL.mean", "SL.glmnet") + + #vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib) + + vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib, adjustment_exclusions = list("V1" = c("V2","V3"), "V3" = c("V1"), "V5" = c("V1","V2","V3"))) + + # V1 + expect_equal(vim$all_vims$V1$fold_results[[1]]$bin_results[[1]]$W_names, c("V4","V5","Imiss_V1","Imiss_V2","Imiss_V3","Imiss_V5")) + + # V2 + expect_equal(vim$all_vims$V2$fold_results[[1]]$bin_results[[1]]$W_names, c("V1","V3","V4","V5","Imiss_V1","Imiss_V2","Imiss_V3","Imiss_V5")) + + # V3 + expect_equal(vim$all_vims$V3$fold_results[[1]]$bin_results[[1]]$W_names, c("V2","V4","V5","Imiss_V1","Imiss_V2","Imiss_V3","Imiss_V5")) + + # V4 + expect_equal(vim$all_vims$V4$fold_results[[1]]$bin_results[[1]]$W_names, c("V1","V2","V3","V5","Imiss_V1","Imiss_V2","Imiss_V3","Imiss_V5")) + + # V5 + expect_equal(vim$all_vims$V5$fold_results[[1]]$bin_results[[1]]$W_names, c("V4","Imiss_V1","Imiss_V2","Imiss_V3","Imiss_V5")) + + }) + + + From 7d1f85b3afa376e0b8e352626be9678449555625 Mon Sep 17 00:00:00 2001 From: DANIELA RODRIGUES Date: Fri, 24 Mar 2023 21:17:48 +0000 Subject: [PATCH 2/3] test --- R/varimpact.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/varimpact.R b/R/varimpact.R index 47a7367..8a43628 100644 --- a/R/varimpact.R +++ b/R/varimpact.R @@ -257,7 +257,7 @@ varimpact = } ######## - # Applied to Explanatory (X) data frame + # Applied to Explanatory (X) data frame. sna = sapply(X, sum_na) n = nrow(X) From 7b6f8f25d43ce37b818bb856f042c0af96940843 Mon Sep 17 00:00:00 2001 From: DANIELA RODRIGUES Date: Fri, 24 Mar 2023 21:23:27 +0000 Subject: [PATCH 3/3] changed "X" back to "data" in varimpact (initially changed it by mistake) --- R/varimpact.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/varimpact.R b/R/varimpact.R index 8a43628..685f20b 100644 --- a/R/varimpact.R +++ b/R/varimpact.R @@ -258,9 +258,9 @@ varimpact = ######## # Applied to Explanatory (X) data frame. - sna = sapply(X, sum_na) + sna = sapply(data, sum_na) - n = nrow(X) + n = nrow(data) ####### # Missing proportion by variable.