diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R deleted file mode 100644 index 44d49962..00000000 --- a/DIMS/MakeInit.R +++ /dev/null @@ -1,31 +0,0 @@ -## adapted from makeInit in old pipeline - -# define parameters -args <- commandArgs(trailingOnly = TRUE) - -sample_sheet <- read.csv(args[1], sep = "\t") -nr_replicates <- as.numeric(args[2]) - -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) -group_names <- gsub("[^-.[:alnum:]]", "_", group_names) -group_names_unique <- unique(group_names) - -# generate the replication pattern -repl_pattern <- c() -for (sample_group in 1:nr_sample_groups) { - tmp <- c() - for (repl in nr_replicates:1) { - index <- ((sample_group * nr_replicates) - repl) + 1 - tmp <- c(tmp, sample_names[index]) - } - repl_pattern <- c(repl_pattern, list(tmp)) -} - -names(repl_pattern) <- group_names_unique - -# preview the replication pattern -print(tail(repl_pattern)) - -save(repl_pattern, file = "init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf deleted file mode 100644 index 7aae0e46..00000000 --- a/DIMS/MakeInit.nf +++ /dev/null @@ -1,18 +0,0 @@ -process MakeInit { - tag "DIMS MakeInit" - label 'MakeInit' - container = 'docker://umcugenbioinf/dims:1.3' - shell = ['/bin/bash', '-euo', 'pipefail'] - - input: - path(samplesheet) - val(nr_replicates) - - output: - path('init.RData') - - script: - """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates - """ -} diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R new file mode 100644 index 00000000..806486fb --- /dev/null +++ b/DIMS/ParseSamplesheet.R @@ -0,0 +1,19 @@ +# define parameters +args <- commandArgs(trailingOnly = TRUE) + +sample_sheet <- as.data.frame(read.csv(args[1], sep = "\t")) +preprocessing_scripts_dir <- args[2] + +# load in function script +source(paste0(preprocessing_scripts_dir, 
# function for parse_samplesheet
generate_repl_pattern <- function(sample_sheet) {
  #' Generate the replication pattern from a sample sheet
  #'
  #' Groups the file names (technical replicates) listed in the sample sheet
  #' by the sample they belong to.
  #'
  #' @param sample_sheet: data frame with a file name column ("File_Name",
  #'   "File Name" or "File.Name") and a sample name column ("Sample_Name",
  #'   "Sample Name" or "Sample.Name"); the first matching column is used
  #'
  #' @return repl_pattern: named list with one element per sample; the names
  #'   are the sanitized sample names, the values are the file names of the
  #'   technical replicates of that sample

  # get the right columns from the samplesheet; "." is accepted as separator
  # because read.csv() with check.names = TRUE turns "File Name" into
  # "File.Name"
  file_name_col <- grep("File[_ .]Name", colnames(sample_sheet))
  sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet))
  if (length(file_name_col) == 0 || length(sample_name_col) == 0) {
    stop(
      "sample sheet must contain a file name and a sample name column",
      call. = FALSE
    )
  }
  file_names <- sample_sheet[[file_name_col[1]]]
  raw_names <- trimws(as.character(sample_sheet[[sample_name_col[1]]]))

  # replace all characters which are not letters, numbers, hyphens or periods;
  # this is done per row so that the rows can be matched against the
  # sanitized group names below
  clean_names <- gsub("[^-.[:alnum:]]", "_", raw_names)

  # groups are ordered on the trimmed sample names (NAs are dropped by sort);
  # unique() after sanitizing merges names that become identical
  group_names <- unique(gsub("[^-.[:alnum:]]", "_", sort(unique(raw_names))))

  # create replication pattern (which technical replicates belong to which
  # sample); which() ignores rows without a sample name
  repl_pattern <- lapply(group_names, function(sample_group) {
    file_names[which(clean_names == sample_group)]
  })
  names(repl_pattern) <- group_names

  repl_pattern
}