From e1e144e2f0cb4c076cdc8c11d4c99109c91c3adf Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 9 Feb 2026 11:56:54 +0100 Subject: [PATCH 1/5] created function for generating replication pattern from sample sheet --- DIMS/MakeInit.R | 32 ++++++------------- DIMS/MakeInit.nf | 3 +- .../parse_samplesheet_functions.R | 29 +++++++++++++++++ 3 files changed, 40 insertions(+), 24 deletions(-) create mode 100644 DIMS/preprocessing/parse_samplesheet_functions.R diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d49962..806486fb 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,31 +1,19 @@ -## adapted from makeInit in old pipeline - # define parameters args <- commandArgs(trailingOnly = TRUE) -sample_sheet <- read.csv(args[1], sep = "\t") -nr_replicates <- as.numeric(args[2]) +sample_sheet <- as.data.frame(read.csv(args[1], sep = "\t")) +preprocessing_scripts_dir <- args[2] -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) -group_names <- gsub("[^-.[:alnum:]]", "_", group_names) -group_names_unique <- unique(group_names) +# load in function script; file.path() works whether or not the directory argument ends with a slash +source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R")) # generate the replication pattern -repl_pattern <- c() -for (sample_group in 1:nr_sample_groups) { - tmp <- c() - for (repl in nr_replicates:1) { - index <- ((sample_group * nr_replicates) - repl) + 1 - tmp <- c(tmp, sample_names[index]) - } - repl_pattern <- c(repl_pattern, list(tmp)) -} - -names(repl_pattern) <- group_names_unique +repl_pattern <- generate_repl_pattern(sample_sheet) -# preview the replication pattern -print(tail(repl_pattern)) +# write the replication pattern to text file for troubleshooting purposes +sink("replication_pattern.txt") +print(repl_pattern) +sink() +# save replication pattern to file save(repl_pattern, file = "init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index
7aae0e46..6f75fa54 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -6,13 +6,12 @@ process MakeInit { input: path(samplesheet) - val(nr_replicates) output: path('init.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $params.preprocessing_scripts_dir """ } diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R new file mode 100644 index 00000000..32c5cb5e --- /dev/null +++ b/DIMS/preprocessing/parse_samplesheet_functions.R @@ -0,0 +1,29 @@ +# function for parse_samplesheet +generate_repl_pattern <- function(sample_sheet) { + #' Generate replication pattern list based on information in sample_sheet + #' + #' @param sample_sheet: data frame with a file name column (File_Name, "File Name" or File.Name) and a sample name column (Sample_Name, "Sample Name" or Sample.Name) + #' + #' @return repl_pattern: named list with one element per unique cleaned-up sample name, + #' containing the file names of the technical replicates of that sample + + # get the right columns from the samplesheet; read.csv (default check.names) turns spaces in headers into periods + file_name_col <- grep("File[_ .]Name", colnames(sample_sheet)) + sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet)) + # per file: trim the sample name and replace all characters which are not letters, numbers, hyphens and periods + sample_name_per_file <- gsub("[^-.[:alnum:]]", "_", trimws(as.vector(unlist(sample_sheet[sample_name_col])))) + # get the unique sample names; files are matched below on the same cleaned-up names that become the list names + sample_names <- sort(unique(sample_name_per_file)) + + # create replication pattern (which technical replicates belong to which sample) + repl_pattern <- list() + for (sample_group in sample_names) { + file_indices <- which(sample_name_per_file == sample_group) + file_names <- sample_sheet[file_indices, file_name_col] + repl_pattern <- c(repl_pattern, list(file_names)) + } + names(repl_pattern) <- sample_names + + return(repl_pattern) +} + From f8fbc4952e78a30bbd2e5fcbaa8597e85712d228 Mon Sep 17 00:00:00 2001 From:
Mia Pras-Raves Date: Mon, 9 Feb 2026 12:05:01 +0100 Subject: [PATCH 2/5] renamed MakeInit step to ParseSamplesheet --- DIMS/{MakeInit.R => ParseSamplesheet.R} | 0 DIMS/{MakeInit.nf => ParseSamplesheet.nf} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename DIMS/{MakeInit.R => ParseSamplesheet.R} (100%) rename DIMS/{MakeInit.nf => ParseSamplesheet.nf} (100%) diff --git a/DIMS/MakeInit.R b/DIMS/ParseSamplesheet.R similarity index 100% rename from DIMS/MakeInit.R rename to DIMS/ParseSamplesheet.R diff --git a/DIMS/MakeInit.nf b/DIMS/ParseSamplesheet.nf similarity index 100% rename from DIMS/MakeInit.nf rename to DIMS/ParseSamplesheet.nf From c4d4cf18eb2df61af8c969808a3cb8ec5cf17c88 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 9 Feb 2026 12:06:42 +0100 Subject: [PATCH 3/5] added unit tests for parse_samplesheet_functions --- .../testthat/parse_samplesheet_functions.R | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 DIMS/tests/testthat/parse_samplesheet_functions.R diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R new file mode 100644 index 00000000..0437f5ee --- /dev/null +++ b/DIMS/tests/testthat/parse_samplesheet_functions.R @@ -0,0 +1,24 @@ +# unit tests for ParseSamplesheet +# function: generate_repl_pattern + +# source all functions for ParseSamplesheet +source("../../preprocessing/parse_samplesheet_functions.R") + +# test generate_repl_pattern +testthat::test_that("replication pattern is correctly generated", { + # create sample sheet to test on: 6 files, 3 sample names with 2 technical replicates each + test_file_names <- paste0(rep("RES_20260101_", 6), sprintf("%03d", 1:6)) + test_sample_names <- sort(rep(c("C1", "P2", "P3"), 2)) + test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names)) + + # test that a list of length 3 is generated + expect_length(generate_repl_pattern(test_sample_sheet), 3) + # test list names +
expect_equal(names(generate_repl_pattern(test_sample_sheet)), unique(test_sample_names)) + + # test that files of sample groups sharing one sample name are merged into a single list element + test_sample_names <- gsub("P3", "P2", test_sample_names) + test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names)) + expect_length(generate_repl_pattern(test_sample_sheet), 2) + expect_length(generate_repl_pattern(test_sample_sheet)$P2, 4) +}) From 99860c503e0161bddea5c14f120195fb43477cec Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 9 Feb 2026 13:49:41 +0100 Subject: [PATCH 4/5] changed process name from MakeInit to ParseSamplesheet --- DIMS/ParseSamplesheet.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf index 6f75fa54..4de7c8fc 100644 --- a/DIMS/ParseSamplesheet.nf +++ b/DIMS/ParseSamplesheet.nf @@ -1,6 +1,6 @@ -process MakeInit { - tag "DIMS MakeInit" - label 'MakeInit' +process ParseSamplesheet { + tag "DIMS ParseSamplesheet" + label 'ParseSamplesheet' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] @@ -9,9 +9,10 @@ process MakeInit { output: path('init.RData') + path('technical_replicates.txt') script: """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $params.preprocessing_scripts_dir """ } From 6dd934eb407ca21369a11e2d39fcccba4c3786ab Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 9 Feb 2026 14:12:09 +0100 Subject: [PATCH 5/5] corrected file name for replication_pattern.txt --- DIMS/ParseSamplesheet.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf index 4de7c8fc..6982c1f0 100644 --- a/DIMS/ParseSamplesheet.nf +++ b/DIMS/ParseSamplesheet.nf @@ -9,7 +9,7 @@ process ParseSamplesheet { output: path('init.RData') -
path('technical_replicates.txt') + path('replication_pattern.txt') script: """