From e1e144e2f0cb4c076cdc8c11d4c99109c91c3adf Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 9 Feb 2026 11:56:54 +0100
Subject: [PATCH 1/5] created function for generating replication pattern from
 sample sheet

---
 DIMS/MakeInit.R                               | 32 ++++++-------------
 DIMS/MakeInit.nf                              |  3 +-
 .../parse_samplesheet_functions.R             | 29 +++++++++++++++++
 3 files changed, 40 insertions(+), 24 deletions(-)
 create mode 100644 DIMS/preprocessing/parse_samplesheet_functions.R

diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 44d49962..806486fb 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,31 +1,19 @@
-## adapted from makeInit in old pipeline
-
 # define parameters
 args <- commandArgs(trailingOnly = TRUE)
 
-sample_sheet <- read.csv(args[1], sep = "\t")
-nr_replicates <- as.numeric(args[2])
+sample_sheet <- as.data.frame(read.csv(args[1], sep = "\t"))
+preprocessing_scripts_dir <- args[2]
 
-sample_names <- trimws(as.vector(unlist(sample_sheet[1])))
-nr_sample_groups <- length(sample_names) / nr_replicates
-group_names <- trimws(as.vector(unlist(sample_sheet[2])))
-group_names <- gsub("[^-.[:alnum:]]", "_", group_names)
-group_names_unique <- unique(group_names)
+# load in function script; file.path works with or without a trailing slash on the directory
+source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R"))
 
 # generate the replication pattern
-repl_pattern <- c()
-for (sample_group in 1:nr_sample_groups) {
-  tmp <- c()
-  for (repl in nr_replicates:1) {
-    index <- ((sample_group * nr_replicates) - repl) + 1
-    tmp <- c(tmp, sample_names[index])
-  }
-  repl_pattern <- c(repl_pattern, list(tmp))
-}
-
-names(repl_pattern) <- group_names_unique
+repl_pattern <- generate_repl_pattern(sample_sheet)
 
-# preview the replication pattern
-print(tail(repl_pattern))
+# write the replication pattern to text file for troubleshooting purposes
+sink("replication_pattern.txt")
+print(repl_pattern)
+sink()
 
+# save replication pattern to file
 save(repl_pattern, file = "init.RData")
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
index 7aae0e46..6f75fa54 100644
--- a/DIMS/MakeInit.nf
+++ b/DIMS/MakeInit.nf
@@ -6,13 +6,12 @@ process MakeInit {
 
     input:
        path(samplesheet) 
-       val(nr_replicates)
 
     output:
        path('init.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates 
+        Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $params.preprocessing_scripts_dir
         """
 }
diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R
new file mode 100644
index 00000000..32c5cb5e
--- /dev/null
+++ b/DIMS/preprocessing/parse_samplesheet_functions.R
@@ -0,0 +1,29 @@
+# function for parse_samplesheet
+generate_repl_pattern <- function(sample_sheet) {
+  #' Generate replication pattern list based on information in sample_sheet
+  #'
+  #' @param sample_sheet: data frame with a file name column and a sample name column
+  #'
+  #' @return repl_pattern: named list; per (cleaned) sample name the file names of its technical replicates
+
+  # get the right columns from the samplesheet (read.csv turns a space in a header into a period)
+  file_name_col <- grep("File[_ .]Name", colnames(sample_sheet))
+  sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet))
+  # clean the sample name of every row: trim whitespace and replace all characters
+  # which are not letters, numbers, hyphens or periods by an underscore
+  row_sample_names <- trimws(as.vector(unlist(sample_sheet[sample_name_col])))
+  row_sample_names <- gsub("[^-.[:alnum:]]", "_", row_sample_names)
+  # get the unique sample names after cleaning, so that list names cannot be duplicated
+  sample_names <- sort(unique(row_sample_names))
+
+  # create replication pattern (which technical replicates belong to which sample);
+  # rows are matched on the cleaned names, otherwise a renamed sample would get no files
+  repl_pattern <- lapply(sample_names, function(sample_group) {
+    sample_sheet[which(row_sample_names == sample_group), file_name_col]
+  })
+  names(repl_pattern) <- sample_names
+
+  return(repl_pattern)
+}
+

From f8fbc4952e78a30bbd2e5fcbaa8597e85712d228 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 9 Feb 2026 12:05:01 +0100
Subject: [PATCH 2/5] renamed MakeInit step to ParseSamplesheet

---
 DIMS/{MakeInit.R => ParseSamplesheet.R}   | 0
 DIMS/{MakeInit.nf => ParseSamplesheet.nf} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename DIMS/{MakeInit.R => ParseSamplesheet.R} (100%)
 rename DIMS/{MakeInit.nf => ParseSamplesheet.nf} (100%)

diff --git a/DIMS/MakeInit.R b/DIMS/ParseSamplesheet.R
similarity index 100%
rename from DIMS/MakeInit.R
rename to DIMS/ParseSamplesheet.R
diff --git a/DIMS/MakeInit.nf b/DIMS/ParseSamplesheet.nf
similarity index 100%
rename from DIMS/MakeInit.nf
rename to DIMS/ParseSamplesheet.nf

From c4d4cf18eb2df61af8c969808a3cb8ec5cf17c88 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 9 Feb 2026 12:06:42 +0100
Subject: [PATCH 3/5] added unit tests for parse_samplesheet_functions

---
 .../testthat/parse_samplesheet_functions.R    | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 DIMS/tests/testthat/parse_samplesheet_functions.R

diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R
new file mode 100644
index 00000000..0437f5ee
--- /dev/null
+++ b/DIMS/tests/testthat/parse_samplesheet_functions.R
@@ -0,0 +1,24 @@
+# unit tests for ParseSamplesheet
+# function: generate_repl_pattern
+
+# source all functions for ParseSamplesheet
+source("../../preprocessing/parse_samplesheet_functions.R")
+
+# test generate_repl_pattern
+testthat::test_that("replication pattern is correctly generated", {
+  # create sample sheet to test on: 6 files, 3 samples with 2 technical replicates each
+  test_file_names <- paste0(rep("RES_20260101_", 6), sprintf("%03d", 1:6))
+  test_sample_names <- sort(rep(c("C1", "P2", "P3"), 2))
+  test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names))
+
+  # test that a list of length 3 is generated
+  expect_length(generate_repl_pattern(test_sample_sheet), 3)
+  # test list names
+  expect_equal(names(generate_repl_pattern(test_sample_sheet)), unique(test_sample_names))
+
+  # test what happens if a sample name is used for more files: P2 should get all 4 files
+  test_sample_names <- gsub("P3", "P2", test_sample_names)
+  test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names))
+  expect_length(generate_repl_pattern(test_sample_sheet), 2)
+  expect_length(generate_repl_pattern(test_sample_sheet)$P2, 4)
+})

From 99860c503e0161bddea5c14f120195fb43477cec Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 9 Feb 2026 13:49:41 +0100
Subject: [PATCH 4/5] changed process name from MakeInit to ParseSamplesheet

---
 DIMS/ParseSamplesheet.nf | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
index 6f75fa54..4de7c8fc 100644
--- a/DIMS/ParseSamplesheet.nf
+++ b/DIMS/ParseSamplesheet.nf
@@ -1,6 +1,6 @@
-process MakeInit {
-    tag "DIMS MakeInit"
-    label 'MakeInit'
+process ParseSamplesheet {
+    tag "DIMS ParseSamplesheet"
+    label 'ParseSamplesheet'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
@@ -9,9 +9,10 @@ process MakeInit {
 
     output:
        path('init.RData')
+       path('technical_replicates.txt')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $params.preprocessing_scripts_dir
+        Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $params.preprocessing_scripts_dir
         """
 }

From 6dd934eb407ca21369a11e2d39fcccba4c3786ab Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 9 Feb 2026 14:12:09 +0100
Subject: [PATCH 5/5] corrected file name for replication_pattern.txt

---
 DIMS/ParseSamplesheet.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
index 4de7c8fc..6982c1f0 100644
--- a/DIMS/ParseSamplesheet.nf
+++ b/DIMS/ParseSamplesheet.nf
@@ -9,7 +9,7 @@ process ParseSamplesheet {
 
     output:
        path('init.RData')
-       path('technical_replicates.txt')
+       path('replication_pattern.txt')
 
     script:
         """