diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd203..e8beffb8 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,5 +1,3 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - # load required package suppressPackageStartupMessages(library("xcms")) @@ -7,55 +5,19 @@ suppressPackageStartupMessages(library("xcms")) cmd_args <- commandArgs(trailingOnly = TRUE) filepath <- cmd_args[1] -outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) - -# initialize -trim_left_pos <- NULL -trim_right_pos <- NULL -trim_left_neg <- NULL -trim_right_neg <- NULL -breaks_fwhm <- NULL -breaks_fwhm_avg <- NULL -bins <- NULL +trim <- as.numeric(cmd_args[2]) +resol <- as.numeric(cmd_args[3]) # read in mzML file raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) -# Get time values for positive and negative scans -pos_times <- raw_data@scantime[raw_data@polarity == "positive"] -neg_times <- raw_data@scantime[raw_data@polarity == "negative"] +# get trim parameters and save them to file +get_trim_parameters(raw_data@scantime, raw_data@polarity, trim) -# trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind +# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z +get_breaks_for_bins(raw_data$mzrange, resol) -# trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) - -# Mass range m/z -low_mz <- raw_data@mzrange[1] +# Determine maximum m/z and save to file high_mz <- raw_data@mzrange[2] - -# determine number of segments (bins) -nr_segments <- 2 * (high_mz - low_mz) -segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) - -# determine start and end of each bin. -for (i in 1:nr_segments) { - start_segment <- segment[i] - end_segment <- segment[i+1] - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) - fwhm_segment <- start_segment / resol_mz - breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) - # average the m/z instead of start value - range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) - delta_mz <- range[2] - range[1] - breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz) -} - -# generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") save(high_mz, file = "highest_mz.RData") + diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8d..e2dc2d46 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -7,13 +7,13 @@ process GenerateBreaks { input: tuple(val(file_id), path(mzML_file)) - output: path('breaks.fwhm.RData'), emit: breaks + path('trim_params.RData'), emit: trim_params path('highest_mz.RData'), emit: highest_mz script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution """ } diff --git a/DIMS/preprocessing/generate_breaks_functions.R b/DIMS/preprocessing/generate_breaks_functions.R new file mode 100644 index 00000000..6ffe09e9 --- /dev/null +++ b/DIMS/preprocessing/generate_breaks_functions.R @@ -0,0 +1,58 @@ +# GenerateBreaks functions +get_trim_parameters <- function(scantimes, polarities, trim = 0.1) { + #' determine the scans per scanmode which are trimmed off; save trim parameters to file + #' + #' @param scantimes: vector of scan times in seconds + #' @param polarities: vector of polarities (positive or negative) + + # Get time values for positive and negative scans + pos_times <- scantimes[polarities == "positive"] + neg_times <- scantimes[polarities == "negative"] + + # trim (remove) scans at the start (15%) and end (5%) for positive + trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) + trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) + + # trim (remove) scans at the start and end (10%) for negative + trim_left_neg <- round(neg_times[length(neg_times) * trim]) + trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) + + # save trim parameters to file + save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") +} + +get_breaks_for_bins <- function(mzrange, resol = 140000) { + #' create a vector with the breaks in m/z of bins for intensities + #' + #' @param mzrange: vector of minimum and maximum m/z values (integeers) + #' @param resol: value for resolution (integer) + + # initialize + breaks_fwhm <- NULL + breaks_fwhm_avg <- NULL + + # determine number of segments used to create bins + nr_segments <- 2 * (mzrange[2] - mzrange[1]) + segments <- seq(from = mzrange[1], to = mzrange[2], length.out = nr_segments + 1) + + # determine start and end of each bin. fwhm (width of peaks) is assumed to be constant within a segment + for (segment_index in 1:nr_segments) { + start_segment <- segments[segment_index] + end_segment <- segments[segment_index + 1] + # determine resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + # determine fwhm (full width at half maximum) of the peaks in this segment + fwhm_segment <- start_segment / resol_mz + # determine the breaks within this segment + breaks_segment <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) + # add breaks for this segment to vector with all breaks + breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) + # get a vector of average m/z instead of start value + delta_mz <- breaks_segment[2] - breaks_segment[1] + avg_breaks_segment <- breaks_segment + 0.5 * delta_mz + breaks_fwhm_avg <- c(breaks_fwhm_avg, avg_breaks_segment) + } + + # save breaks to file + save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +} diff --git a/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData b/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData new file mode 100644 index 00000000..699281d5 Binary files /dev/null and b/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData differ diff --git a/DIMS/tests/testthat/fixtures/test_trim_params.RData b/DIMS/tests/testthat/fixtures/test_trim_params.RData new file mode 100644 index 00000000..f9370f55 Binary files /dev/null and b/DIMS/tests/testthat/fixtures/test_trim_params.RData differ diff --git a/DIMS/tests/testthat/test_generate_breaks.R b/DIMS/tests/testthat/test_generate_breaks.R new file mode 100644 index 00000000..f6214a45 --- /dev/null +++ b/DIMS/tests/testthat/test_generate_breaks.R @@ -0,0 +1,54 @@ +# unit tests for GenerateBreaks +# functions: get_trim_parameters and get_breaks_for_bins + +# source all functions for GenerateBreaks +source("../../preprocessing/generate_breaks_functions.R") + +# test get_trim_parameters +testthat::test_that("trim parameters are correctly calculated", { + # create list of scan times to test on: + test_scantimes <- seq(0, 100, length = 168) + test_polarities <- c(rep("positive", 84), rep("negative", 84)) + test_trim <- 0.1 + + # test that the function produces no output except trim_params.RData file + expect_silent(get_trim_parameters(test_scantimes, test_polarities, test_trim)) + expect_true(file.exists("./trim_params.RData")) + + # test the parameters in the output RData file + load("./trim_params.RData") + test_trim_left_pos <- trim_left_pos + test_trim_left_neg <- trim_left_neg + test_trim_right_pos <- trim_right_pos + test_trim_right_neg <- trim_right_neg + rm(trim_left_pos, trim_left_neg, trim_right_pos, trim_right_neg) + + # load previously stored values from fixtures + load("fixtures/test_trim_params.RData") + expect_equal(test_trim_left_pos, trim_left_pos) + expect_equal(test_trim_left_neg, trim_left_neg) + expect_equal(test_trim_right_pos, trim_right_pos) + expect_equal(test_trim_right_neg, trim_right_neg) +}) + +# test get_breaks_for_bins +testthat::test_that("breaks are correctly calculated", { + # create list of scan times to test on: + test_mzrange <- c(300, 400) + test_resol <- 140000 + + # test that the function produces no output except breaks.fwhm.RData file + expect_silent(get_breaks_for_bins(test_mzrange, test_resol)) + expect_true(file.exists("./breaks.fwhm.RData")) + + # test the vectors in the output RData file + load("./breaks.fwhm.RData") + test_breaks_fwhm <- breaks_fwhm + test_breaks_fwhm_avg <- breaks_fwhm_avg + rm(breaks_fwhm, breaks_fwhm_avg) + + # load breaks from fixtures and compare vectors + load("fixtures/breaks.fwhm.RData") + expect_equal(test_breaks_fwhm, breaks_fwhm) + expect_equal(test_breaks_fwhm_avg, breaks_fwhm_avg) +})