diff --git a/R/DESCRIPTION b/R/DESCRIPTION
index 453e6a3..ec3008e 100644
--- a/R/DESCRIPTION
+++ b/R/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: power.transform
 Title: Location and Scale Invariant Power Transformations
-Version: 1.0.2
+Version: 1.0.3
 Authors@R: c(
     person("Alex", "Zwanenburg",
diff --git a/R/NEWS.md b/R/NEWS.md
index 3a08a75..01ff76c 100644
--- a/R/NEWS.md
+++ b/R/NEWS.md
@@ -1,3 +1,11 @@
+# Version 1.0.3
+
+## Changes
+
+- Fixed an error that occurred when assessing whether a transformation should
+  be rejected but the empirical central normality test p-value could not be
+  computed.
+
 # Version 1.0.2
 
 ## Changes
diff --git a/R/R/FindParameters.R b/R/R/FindParameters.R
index 29651dd..a295e71 100644
--- a/R/R/FindParameters.R
+++ b/R/R/FindParameters.R
@@ -170,7 +170,25 @@ find_transformation_parameters <- function(
       transformer = object,
       verbose = FALSE)
 
-    if (gof_test_p < empirical_gof_normality_p_value) {
+    if (is.na(gof_test_p)) {
+      rlang::warn(
+        message = paste0(
+          "The p-value of the transformed data (", gof_test_p, ") could not be determined, likely due to an internal error. ",
+          "The transformation is rejected, and data are kept as is."
+        ),
+        class = "power_transform_no_transform"
+      )
+
+      object <- methods::new("transformationNone")
+
+      object <- .set_transformation_parameters(
+        object = object,
+        x = x,
+        lambda = lambda,
+        ...
+      )
+
+    } else if (gof_test_p < empirical_gof_normality_p_value) {
       rlang::warn(
         message = paste0(
           "The p-value of the transformed data (", gof_test_p, ") is below the required ",
diff --git a/R/R/GoodnessOfFit.R b/R/R/GoodnessOfFit.R
index 23d4675..be82996 100644
--- a/R/R/GoodnessOfFit.R
+++ b/R/R/GoodnessOfFit.R
@@ -61,7 +61,7 @@ assess_transformation <- function(
 
   if (is.na(h$p_value)) {
     message("p-value could not be determined.")
-  } else if (h$p_value > 0.0001 && verbose) {
+  } else if (h$p_value < 0.0001 && verbose) {
     message("p-value is smaller than 0.0001")
   }
 
@@ -255,8 +255,8 @@ ecn_test <- function(
   # Find those elements that are closest to tau_lookup.
   y <- tau - tau_lookup
 
-  x[which(y == min(y[y >= 0.0]))] <- TRUE
-  x[which(y == max(y[y <= 0.0]))] <- TRUE
+  if (any(y >= 0.0)) x[which(y == min(y[y >= 0.0]))] <- TRUE
+  if (any(y <= 0.0)) x[which(y == max(y[y <= 0.0]))] <- TRUE
 
   return(x)
 }
diff --git a/R/cran-comments.md b/R/cran-comments.md
index 934e805..2598322 100644
--- a/R/cran-comments.md
+++ b/R/cran-comments.md
@@ -1,11 +1,27 @@
-This is a new release. R CMD was run locally (windows), and using GitHub Actions on Ubuntu, Mac-OS and Windows.
+This is a new release. R CMD check was run locally (Windows) and via GitHub
+Actions on Ubuntu, macOS and Windows, for the latest R release and R-devel.
+
+Version 1.0.3 works around the issue that led to the check error on
+r-devel-linux-x86_64-fedora-gcc for version 1.0.2. That error was caused by a
+bug in the data.table package, which has been fixed in data.table 1.18.0.
 
 Additionally, R CMD was run with `_R_CHECK_DEPENDS_ONLY_ = True`
 
 # R CMD check results
 
-0 errors | 0 warnings | 1 note
+0 errors | 0 warnings | 2 notes
+
+❯ checking CRAN incoming feasibility ... [12s] NOTE
+  Maintainer: 'Alex Zwanenburg '
+
+  New submission
+
+  Package was archived on CRAN
+
+  CRAN repository db overrides:
+    X-CRAN-Comment: Archived on 2026-01-15 as issues were not addressed
+      in time.
 
 ❯ checking for future file timestamps ...
NOTE unable to verify current time diff --git a/R/tests/testthat/test-goodness-of-fit.R b/R/tests/testthat/test-goodness-of-fit.R index 1ec94b8..1e31f1c 100644 --- a/R/tests/testthat/test-goodness-of-fit.R +++ b/R/tests/testthat/test-goodness-of-fit.R @@ -1,3 +1,23 @@ +# Assess whether test should be skipped due to external factors. +x <- stats::rnorm(n = 100L) + +transformer <- power.transform::find_transformation_parameters( + x = x, + method = "yeo_johnson", + robust = FALSE, + invariant = FALSE, + estimation_method = "mle" +) + +# Compute the p-value. +p_value <- suppressMessages(power.transform::assess_transformation( + x = x, + transformer = transformer +)) + +testthat::skip_if(is.na(p_value)) + + parameter_list <- list() ii <- 1 for (method in c("box_cox", "yeo_johnson", "none")) { diff --git a/manuscript_latex/manuscript.bbl b/manuscript_latex/manuscript.bbl index 92323ea..75835ac 100644 --- a/manuscript_latex/manuscript.bbl +++ b/manuscript_latex/manuscript.bbl @@ -374,6 +374,35 @@ Griffin, M.% \APACaddressPublisher{}{The R Foundation}. \PrintBackRefs{\CurrentBib} +\bibitem [\protect \citeauthoryear {% +Huber% +}{% +Huber% +}{% +{\protect \APACyear {1964}}% +}]{% +Huber1964-xk} +\APACinsertmetastar {% +Huber1964-xk}% +\begin{APACrefauthors}% +Huber, P.J.% +\end{APACrefauthors}% +\unskip\ +\newblock +\APACrefYearMonthDay{1964}{{\APACmonth{03}}}{}. +\newblock +{\BBOQ}\APACrefatitle {Robust estimation of a location parameter} {Robust + estimation of a location parameter}.{\BBCQ} +\newblock +\APACjournalVolNumPages{Ann. Math. Stat.}{35}{1}{73--101,} +\newblock +\begin{APACrefDOI} \doi{10.1214/aoms/1177703732} \end{APACrefDOI} +\newblock + +\newblock + +\PrintBackRefs{\CurrentBib} + \bibitem [\protect \citeauthoryear {% Huber% }{% @@ -647,6 +676,38 @@ Rousseeuw, P.J.% \PrintBackRefs{\CurrentBib} +\bibitem [\protect \citeauthoryear {% +Rousseeuw% +\ \BBA {} Croux% +}{% +Rousseeuw% +\ \BBA {} Croux% +}{% +{\protect \APACyear {1994}}% +}]{% +Rousseeuw1994-ur} +\APACinsertmetastar {% +Rousseeuw1994-ur}% +\begin{APACrefauthors}% +Rousseeuw, P.J.% +\BCBT {}\ \BBA {} Croux, C.% +\end{APACrefauthors}% +\unskip\ +\newblock +\APACrefYearMonthDay{1994}{{\APACmonth{08}}}{}. +\newblock +{\BBOQ}\APACrefatitle {The bias of k-step {M}-estimators} {The bias of k-step + {M}-estimators}.{\BBCQ} +\newblock +\APACjournalVolNumPages{Stat. Probab. Lett.}{20}{5}{411--420,} +\newblock +\begin{APACrefDOI} \doi{10.1016/0167-7152(94)90133-3} \end{APACrefDOI} +\newblock + +\newblock + +\PrintBackRefs{\CurrentBib} + \bibitem [\protect \citeauthoryear {% Shapiro% \ \BBA {} Wilk% @@ -785,6 +846,32 @@ Tukey, J.W.% \APACaddressPublisher{}{Addison-Wesley Publishing Company}. \PrintBackRefs{\CurrentBib} +\bibitem [\protect \citeauthoryear {% +van~der Vaart% +}{% +van~der Vaart% +}{% +{\protect \APACyear {1998}}% +}]{% +van-der-Vaart1998-xu} +\APACinsertmetastar {% +van-der-Vaart1998-xu}% +\begin{APACrefauthors}% +van~der Vaart, A.W.% +\end{APACrefauthors}% +\unskip\ +\newblock +\APACrefYearMonthDay{1998}{{\APACmonth{10}}}{}. +\newblock +{\BBOQ}\APACrefatitle {Quantiles and Order Statistics} {Quantiles and order + statistics}.{\BBCQ} +\newblock + \APACrefbtitle {Asymptotic Statistics} {Asymptotic statistics}\ (\BPGS\ + 304--315). +\newblock +\APACaddressPublisher{Cambridge}{Cambridge University Press}. 
+\PrintBackRefs{\CurrentBib} + \bibitem [\protect \citeauthoryear {% von Mises% }{% diff --git a/manuscript_latex/manuscript.blg b/manuscript_latex/manuscript.blg new file mode 100644 index 0000000..c706306 --- /dev/null +++ b/manuscript_latex/manuscript.blg @@ -0,0 +1,60 @@ +This is BibTeX, Version 0.99d +Capacity: max_strings=200000, hash_size=200000, hash_prime=170003 +The top-level auxiliary file: manuscript.aux +Reallocating 'name_of_file' (item size: 1) to 11 items. +The style file: sn-apacite.bst +Reallocating 'name_of_file' (item size: 1) to 5 items. +Reallocating 'glb_str_end' (item size: 4) to 20 items. +Reallocating 'glb_str_ptr' (item size: 4) to 20 items. +Reallocating 'global_strs' (item size: 1) to 4000000 items. +Reallocating 'singl_function' (item size: 4) to 100 items. +Reallocating 'wiz_functions' (item size: 4) to 6000 items. +Reallocating 'singl_function' (item size: 4) to 100 items. +Reallocating 'singl_function' (item size: 4) to 100 items. +Reallocating 'wiz_functions' (item size: 4) to 9000 items. +Reallocating 'singl_function' (item size: 4) to 100 items. +Database file #1: refs.bib +apacite.bst [2013/07/21 v6.03 APA bibliography style] +Warning--No address in Powell2009-zb +You've used 34 entries, + 7437 wiz_defined-function locations, + 1636 strings with 17752 characters, +and the built_in function-call counts, 64320 in all, are: += -- 8025 +> -- 493 +< -- 2987 ++ -- 3073 +- -- 69 +* -- 4102 +:= -- 11712 +add.period$ -- 104 +call.type$ -- 34 +change.case$ -- 641 +chr.to.int$ -- 0 +cite$ -- 69 +duplicate$ -- 1659 +empty$ -- 2579 +format.name$ -- 753 +if$ -- 12313 +int.to.chr$ -- 0 +int.to.str$ -- 106 +missing$ -- 0 +newline$ -- 960 +num.names$ -- 274 +pop$ -- 1114 +preamble$ -- 1 +purify$ -- 180 +quote$ -- 0 +skip$ -- 4144 +stack$ -- 0 +substring$ -- 3473 +swap$ -- 980 +text.length$ -- 153 +text.prefix$ -- 0 +top$ -- 1 +type$ -- 2912 +warning$ -- 1 +while$ -- 291 +width$ -- 0 +write$ -- 1117 +(There was 1 warning) diff --git a/manuscript_latex/manuscript.pdf b/manuscript_latex/manuscript.pdf index 16604c3..89f3b3d 100644 Binary files a/manuscript_latex/manuscript.pdf and b/manuscript_latex/manuscript.pdf differ diff --git a/manuscript_latex/manuscript.tex b/manuscript_latex/manuscript.tex index 2a63f73..d0b9716 100644 --- a/manuscript_latex/manuscript.tex +++ b/manuscript_latex/manuscript.tex @@ -197,7 +197,7 @@ \section{Introduction}\label{introduction} The Box-Cox transformation is defined as: \begin{definition}[Box-Cox power transformation] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a vector of length $n > 0$. Let $\lambda \in \mathcal{R}$ be a transformation parameter. Then the Box-Cox power transformation of element $x_i$ is defined as: \begin{equation} @@ -216,7 +216,7 @@ \section{Introduction}\label{introduction} \(x_i \in \mathbb{R}\), as follows: \begin{definition}[Yeo-Johnson power transformation] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a vector of length $n > 0$. Let $\lambda \in \mathcal{R}$ be a transformation parameter. \begin{equation} \label{eqn:yeo-johnson-original} @@ -298,7 +298,7 @@ \section{Theory}\label{theory} otherwise. 
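As an aside for readers following along in code: the Box-Cox and Yeo-Johnson definitions quoted in the hunks above translate directly into R. The sketch below restates the standard published formulas (the equation bodies themselves fall outside the quoted hunks); it is illustrative only, not the package implementation. The location- and scale-invariant variants defined further down apply the same maps after substituting \((x_i - x_0)/s\) for \(x_i\); since their equation bodies are also elided here, that reading is an assumption.

```r
# Standard Box-Cox transformation; defined for x > 0 only.
box_cox <- function(x, lambda) {
  stopifnot(all(x > 0.0))
  if (abs(lambda) < .Machine$double.eps) log(x) else (x^lambda - 1.0) / lambda
}

# Standard Yeo-Johnson transformation; defined for all real x.
yeo_johnson <- function(x, lambda) {
  y <- numeric(length(x))
  pos <- x >= 0.0
  y[pos] <- if (abs(lambda) < .Machine$double.eps) {
    log1p(x[pos])
  } else {
    ((x[pos] + 1.0)^lambda - 1.0) / lambda
  }
  y[!pos] <- if (abs(lambda - 2.0) < .Machine$double.eps) {
    -log1p(-x[!pos])
  } else {
    -((1.0 - x[!pos])^(2.0 - lambda) - 1.0) / (2.0 - lambda)
  }
  y
}
```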
\begin{definition}[Numeric feature] -A numeric feature is a finite sequence $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \right\}$ of length $n$. +A numeric feature is a vector $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ of length $n$. \end{definition} \subsection{Location- and scale-invariant power @@ -318,7 +318,7 @@ \subsection{Location- and scale-invariant power \begin{definition}[Location- and scale- invariant Box-Cox power transformation] Let $\lambda \in \mathcal{R}$ be a transformation parameter. Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i - x_0 > 0 \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Box-Cox power transformation of element $x_i$ is defined as: @@ -339,7 +339,7 @@ \subsection{Location- and scale-invariant power \begin{definition}[Location- and scale- invariant Yeo-Johnson power transformation] Let $\lambda \in \mathcal{R}$ be a transformation parameter. Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathcal{R} \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Yeo-Johnson power transformation of element $x_i$ is defined as: @@ -363,7 +363,7 @@ \subsection{Location- and scale-invariant power \begin{definition} [Log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Box-Cox power transformation is defined as: \begin{equation} @@ -380,7 +380,7 @@ \subsection{Location- and scale-invariant power \begin{definition}[Log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation is defined as: \begin{equation} @@ -424,12 +424,12 @@ \subsection{Robust location- and scale-invariant power \begin{definition}[Weighted log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. 
-Let $\mu_w$ be the weighted mean of the Box-Cox transformed sequence: +Let $\mu_w$ be the weighted mean of the Box-Cox transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{BC}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -452,12 +452,12 @@ \subsection{Robust location- and scale-invariant power Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. -Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed sequence: +Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{YJ}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -652,8 +652,8 @@ \subsection{Empirical central normality tests may be too stringent with large sample sizes, outliers, or both. Here we develop an empirical test for central normality. -\begin{definition}[Central portion of a sequence] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R}\right\}$ be a finite sequence of length $n > 0$, +\begin{definition}[Central portion of a numeric feature] +Let $\mathbf{X}$ be a numeric feature of length $n > 0$, as before, ordered so that $x_1 \leq x_2 \leq \ldots \leq x_n$. Let $p_i = \frac{i - 1/3}{n + 1/3}$, with $i = 1, 2, \ldots n$ be the percentile value corresponding to each element in $\mathbf{X}$. For elements with tied values in $\mathbf{X}$, @@ -667,7 +667,7 @@ \subsection{Empirical central normality \end{definition} -\begin{definition}[Residual errors of the central portion of a sequence] +\begin{definition}[Residual errors of the central portion of a feature] Let the residual error for each element of $\mathbf{X}$ be $r_i =\left| \frac{x_i - \mu_M}{\sigma_M} - F^{-1}_{\mathcal{N}}(p_i) \right|$, with $\mu_M$ and $\sigma_M$ robust Huber M-estimates of location and scale of $\mathbf{X}$ \citep{Huber1981-su}, @@ -680,31 +680,55 @@ \subsection{Empirical central normality \begin{definition}[Central normal distribution] -A central normal distribution is any distribution whose quantile function is -defined by $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, +A central normal distribution is any distribution whose quantile function +satisfies $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, with $\mathcal{N}$ any normal distribution. \end{definition} \begin{corollary} -For $\kappa = 1.0$, the central normal distribution is a normal distribution. +A normal distribution is a central normal distribution. 
\end{corollary} \begin{corollary} -By the Glivenko-Cantelli theorem, if a sequence $\mathbf{X}$ was sampled from a -central normal distribution, then almost surely -$\lim_{n \rightarrow \infty} \sum_{r_j \in \mathbf{R}_{\text{c}}} r_j = 0$, +For $\kappa = 1.0$, a central normal distribution is a normal distribution. \end{corollary} +\begin{corollary} + +If $\mathbf{X}$ is randomly sampled from a normal distribution, +$\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0$, almost surely. +That is, every residual of the central portion of $\mathbf{X}$ will asymptotically +converge to 0. + +\end{corollary} + +The latter corollary follows from the definition of the residuals and +the restriction to normal distributions. Residuals were defined using +\(\frac{x_i - \mu_M}{\sigma_M} \approx \hat{F}^{-1}_{\mathcal{N}}(p_i)\), +with \(\hat{F}^{-1}_{\mathcal{N}}(p_i)\) the empirical quantile function +resembling the quantile function of the normal distribution +\(\mathcal{N}(0, 1)\). Since +\(\hat{F}^{-1}_{\mathcal{N}}(p) \rightarrow F^{-1}_{\mathcal{N}}(p)\) +with \(\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}\) +\citep{van-der-Vaart1998-xu}, +\(\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0\), if and only +if for all \(x_i \in \mathbf{X}_c\), +\(\frac{x_i - \mu_M}{\sigma_M} \rightarrow \hat{F}^{-1}_{\mathcal{N}}(p_i)\). +Thus, the requirement is that the estimators for location and scale +\(\mu_M\) and \(\sigma_M\) are bias-free. M-estimators for location and +scale satisfy this condition, if \(\mathbf{X}\) follows a normal +distribution without contamination \citep{Huber1964-xk, Huber1981-su}. + \begin{definition}[Central normality test] -The null-hypothesis $\mathcal{H}_0$ is that the sequence $\mathbf{X}$ was sampled +The null-hypothesis $\mathcal{H}_0$ is that $\mathbf{X}$ was randomly sampled from a population that is distributed according to a central normal distribution. -The alternative hypothesis is that the sequence $\mathbf{X}$ was sampled from a +The alternative hypothesis is that $\mathbf{X}$ was randomly sampled from a population that is not distributed according to a central normal distribution. \end{definition} @@ -746,8 +770,8 @@ \subsection{Invariance to location and To assess whether the proposed power transformations lead to values of \(\lambda\) that are invariant to location and scale of the -distribution, we simulated three different sequences. We first randomly -drew \(10000\) values from a normal distribution: +distribution, we simulated three different numeric features. We first +randomly drew \(10000\) values from a normal distribution: \(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{N}\left(0, 1\right)\), or equivalently \(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.5, 2\right)\). @@ -785,15 +809,15 @@ \subsection{Invariance to location and \subsection{Empirical central normality test}\label{simulation-ecn-test} The critical test statistic for the empirical central normality test -depends on the significance level \(\alpha\), sequence length \(n\) and +depends on the significance level \(\alpha\), feature length \(n\) and central portion \(\kappa\). Critical test statistic values were set through simulation, as described below. 
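Before the simulation details that follow, the corollary above can be checked numerically. A minimal sketch, assuming `MASS::hubers()` as the Huber M-estimator of location and scale, and taking \(\tau_\kappa\) to be the mean residual over the central portion (consistent with the loss definition used later in the text):

```r
library(MASS)  # hubers(): Huber "Proposal 2" estimates of location and scale

set.seed(1L)
kappa <- 0.80

for (n in c(100L, 1000L, 10000L)) {
  x <- sort(rnorm(n))
  p <- (seq_len(n) - 1 / 3) / (n + 1 / 3)  # p_i = (i - 1/3) / (n + 1/3)
  h <- hubers(x)
  r <- abs((x - h$mu) / h$s - qnorm(p))    # residuals vs. N(0, 1) quantiles
  central <- p >= (1 - kappa) / 2 & p <= (1 + kappa) / 2
  cat(sprintf(
    "n = %5d  tau = %.4f  sup = %.4f\n",
    n, mean(r[central]), max(r[central])
  ))
}
```

Both the mean and the supremum of the central residuals shrink as \(n\) grows, in line with the convergence argument.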
For each number of samples \(n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 4.0000 \right \} \right\}\) -we randomly sampled \(m_d = 30000\) ordered sequences \(\mathbf{X}\) -from \(\mathcal{N}(0,1)\). For each sequence we then computed -\(\tau_{\kappa}\) for +we randomly sampled \(m_d = 30000\) ordered numeric features +\(\mathbf{X}\) from \(\mathcal{N}(0,1)\). For each feature we then +computed \(\tau_{\kappa}\) for \(\kappa \in \left\{0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.00\right\}\). Figure \ref{fig:empirical-central-normality-combined} shows the critical @@ -813,23 +837,23 @@ \subsection{Empirical central normality test}\label{simulation-ecn-test} To assess type I error rates for the empirical central normality test and compare these to the Shapiro-Wilk test for normality, we randomly -drew another 10000 sequences from a central normal distribution for each +drew another 10000 numeric features from a central normal distribution +for each \(n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 3.0000 \right \} \right\}\). -Each sequence was created as follows: first \(n\) values are sampled -from \(\mathcal{N}(0, 1)\). Then, the elements outside the central -portion of the sequence -(i.e.~\(\mathbf{X} \setminus \mathbf{X}_{\text{c}}\)) are replaced. -Elements with \(p_i < \frac{1-\kappa}{2}\) are replaced with values -sampled from \(\mathcal{U}(-10,\min(\mathbf{X}_{\text{c}}))\), and -elements with \(p_i > \frac{1 + \kappa}{2}\) are replaced with values -sampled from \(\mathcal{U}(\max(\mathbf{X}_{\text{c}}), 10)\). - -For each sequence, the p-value for the respective test was used to -reject the null hypothesis that sequences were derived from a population -with a (central) normal distribution. The null hypothesis was rejected -if \(p \leq 0.05\). The type I error rate was determined by computing -the fraction of sequences rejected this way. Figure +Each feature was created as follows: first \(n\) values are sampled from +\(\mathcal{N}(0, 1)\). Then, the elements outside the central portion of +the feature (i.e.~\(\mathbf{X} \setminus \mathbf{X}_{\text{c}}\)) are +replaced. Elements with \(p_i < \frac{1-\kappa}{2}\) are replaced with +values sampled from \(\mathcal{U}(-10,\min(\mathbf{X}_{\text{c}}))\), +and elements with \(p_i > \frac{1 + \kappa}{2}\) are replaced with +values sampled from \(\mathcal{U}(\max(\mathbf{X}_{\text{c}}), 10)\). + +For each feature, the p-value for the respective test was used to reject +the null hypothesis that features were derived from a population with a +(central) normal distribution. The null hypothesis was rejected if +\(p \leq 0.05\). The type I error rate was determined by computing the +fraction of features rejected this way. Figure \ref{fig:empirical-central-normality-error_rate} shows that when the central portion \(\kappa\) of the central normal distribution is equal or greater than the \(\kappa\) parameter for the empirical central @@ -887,19 +911,18 @@ \subsection{Robust transformations}\label{robust-transformations} normal distribution. To determine the weighting function parameters for each of the nine -combinations, \(m_d=500\) sequences \(\mathbf{X}_i\) +combinations, \(m_d=500\) numeric features \(\mathbf{X}_i\) (\(i \in \{1, 2, \ldots, m_d\}\)) were randomly drawn from randomly parametrised asymmetric generalised normal distributions. 
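As another aside: the construction of features from a central normal distribution, described in the previous paragraphs, fits in a few lines. A sketch with an illustrative helper name; this is not the simulation code used for the study:

```r
# Draw one feature of length n from a central normal distribution:
# keep the central portion of a N(0, 1) sample and replace the tails
# with uniform draws outside the range of the central portion.
sample_central_normal <- function(n, kappa = 0.80) {
  x <- sort(rnorm(n))
  p <- (seq_len(n) - 1 / 3) / (n + 1 / 3)
  central <- p >= (1 - kappa) / 2 & p <= (1 + kappa) / 2
  x_central <- x[central]
  lower <- p < (1 - kappa) / 2
  upper <- p > (1 + kappa) / 2
  x[lower] <- runif(sum(lower), min = -10.0, max = min(x_central))
  x[upper] <- runif(sum(upper), min = max(x_central), max = 10.0)
  x
}

set.seed(1L)
x <- sample_central_normal(1000L)
```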
Each distribution was parametrised with a random skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter \(\beta \sim U\left(1.00, 5.00 \right)\). Location and scale parameters were set as \(\mu = 0\) and \(\sigma = 1\), respectively. To form each -sequence \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) instances -were then randomly drawn, with -\(\gamma \sim U\left(\log_{10}50, 3\right)\), resulting in a set of -sequences with between \(50\) and \(1000\) elements each. Outlier values -were then drawn to randomly replace 10 percent of the elements of -\(\mathbf{X}_i\). These values were set according to +feature \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) instances were +then randomly drawn, with \(\gamma \sim U\left(\log_{10}50, 3\right)\), +resulting in a set of features with between \(50\) and \(1000\) elements +each. Outlier values were then drawn to randomly replace 10 percent of +the elements of \(\mathbf{X}_i\). These values were set according to \citet{Tukey1977-xm}, as follows. Let \(x^{*} \sim U\left(-2, 2\right)\). Then the corresponding outlier value was: @@ -922,16 +945,16 @@ \subsection{Robust transformations}\label{robust-transformations} \(L = \sum_{i=1}^{m_d} L_{\text{cn},i} + 0.1 \sum_{i=1}^{m_d} L_{\lambda,i}\). The composite loss consisted of two components: a loss term \(L_{\text{cn},i} = \tau_{n_i, \kappa = 0.80,i}\), i.e.~the mean of -residual errors of the central 80\% of elements of each sequence +residual errors of the central 80\% of elements of each feature \(\mathbf{X}_i\), which aimed at optimising central normality; and a loss term \(L_{\lambda,i} = \max\left(0.0,|\lambda_{0,i} - \lambda_{i}| - \xi \right)\), with \(\lambda_{0,i}\) and \(\lambda_{i}\) transformation parameters -found for sequence \(\mathbf{X}_i\) prior to and after adding outliers, +found for feature \(\mathbf{X}_i\) prior to and after adding outliers, respectively, and tolerance parameter \(\xi = 0.5\) for Box-Cox and \(\xi = 0.3\) for Yeo-Johnson power transformations. This second term aimed to prevent solutions that provide small improvements in central -normality at the cost of a poor fit of the tails of sequences. +normality at the cost of a poor fit of the tails of features. Minimisation was conducted using the BOBYQA algorithm for derivative-free bound constraint optimisation \citep{Powell2009-zb}. The @@ -1009,40 +1032,47 @@ \subsection{Robust transformations}\label{robust-transformations} \subsection{Assessing transformations using simulated data}\label{assessing-transformations-using-simulated-data} -Three datasets with 100000 sequences each were created to assess -transformation to normality. For each sequence \(\mathbf{X}_i\), +Three datasets with 100000 numeric features each were created to assess +transformation to normality. For each feature \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) elements were randomly drawn, with -\(\gamma \sim U\left(1, 4\right)\), resulting in sequences with between +\(\gamma \sim U\left(1, 4\right)\), resulting in features with between \(10\) and \(10000\) elements. \begin{itemize} \item - A \textit{clean} dataset with each sequence drawn from + A \textit{clean} dataset with each feature drawn from \(\mathcal{N}(0, 1)\) and transformed using an inverse power transformation with randomly drawn transformation parameter \(\lambda \sim U\left(0.00, 2.00 \right)\). 
\(\left(\phi_{\text{BC}}^\lambda\right)^{-1}\) and \(\left(\phi_{\text{YJ}}^\lambda\right)^{-1}\) were used as inverse transformations for assessing Box-Cox and Yeo-Johnson transformations, - respectively. Prior to inverse transformation, sequences for Box-Cox + respectively. Prior to inverse transformation, features for Box-Cox transformations were shifted into the positive domain, with minimum value \(1\). \item - A \textit{dirty} dataset with each sequence drawn from + A \textit{dirty} dataset with each feature drawn from \(\mathcal{AGN}\left(\mu = 0, \sigma = 1/\sqrt{2}, \alpha, \beta \right)\), with randomly drawn skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter - \(\beta \sim U\left(1.00, 5.00 \right)\). + \(\beta \sim U\left(1.00, 5.00 \right)\). The features are drawn from + a range of asymmetric unimodal distributions to reflect a wider range + of generating distributions than for the data presented in the first + dataset. \item - A \textit{shifted} dataset with each sequence drawn from - \(\mathcal{AGN}\left(\mu = 100, \sigma = 10^{-3} \cdot 1/\sqrt{2} , \alpha, \beta \right)\), + A \textit{shifted} dataset with each feature drawn from + \(\mathcal{AGN}\left(\mu = 1000, \sigma = 1/\sqrt{2} , \alpha, \beta \right)\), with randomly drawn skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter - \(\beta \sim U\left(1.00, 5.00 \right)\). + \(\beta \sim U\left(1.00, 5.00 \right)\). The generating process + creates features similar to those in the \textit{dirty} dataset, but + shifted to a location that leads to normalisation issues observed for + conventional power transformation methods, as shown in Figure + \ref{fig:decreased-normality}. \end{itemize} A dataset with outliers was created for each of the above datasets by -replacing 10 percent of elements in each sequence, as described earlier. +replacing 10 percent of elements in each feature, as described earlier. In addition to no power transformation and location- and scale-invariant power transformations, conventional and Raymaekers and Rousseeuw's @@ -1055,8 +1085,7 @@ \subsection{Assessing transformations using simulated \item z-standardisation: \(x^{\prime}_{i} = \left(x_i - \mu \right) / \sigma\), with \(\mu\) - and \(\sigma\) the mean and standard deviation of sequence - \(\mathbf{X}\). + and \(\sigma\) the mean and standard deviation of \(\mathbf{X}\). \item robust scaling: \(x^{\prime}_{i} = \left(x_i - \text{median}\left(\mathbf{X}\right) \right) / Q_n\left(\mathbf{X}\right)\), @@ -1065,15 +1094,15 @@ \subsection{Assessing transformations using simulated \end{enumerate} This results in nine types of power transformation. Transformation -parameters were optimised for each sequence. Subsequently the sum of +parameters were optimised for each feature. Subsequently the sum of residual errors and sum of residual errors of the central portion -(\(\kappa = 0.80\)) were computed for each sequence after -transformation. Then, each method was ranked according to the sum of -residual errors (data without outliers) or the sum of residual errors of -the central portion (data with outliers). The average rank of each -transformation method was computed over all 100000 sequences in each -dataset. Average ranks for Yeo-Johnson transformations are shown in -Table \ref{tab:comparison_methods_simulations_yeo_johnson}. Location and +(\(\kappa = 0.80\)) were computed for each feature after transformation. 
+Then, each method was ranked according to the sum of residual errors +(data without outliers) or the sum of residual errors of the central +portion (data with outliers). The average rank of each transformation +method was computed over all 100000 features in each dataset. Average +ranks for Yeo-Johnson transformations are shown in Table +\ref{tab:comparison_methods_simulations_yeo_johnson}. Location and shift-invariant Yeo-Johnson transformation ranked best for all datasets without outliers. The robust variant ranked best in dirty and shifted datasets with outliers, but not in the clean dataset. Results for @@ -1083,9 +1112,9 @@ \subsection{Assessing transformations using simulated \begin{table} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -1098,15 +1127,15 @@ \subsection{Assessing transformations using simulated \midrule -none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.46 & 6.62 \\ -conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.51 & 5.95 \\ -conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.48 & 4.93 \\ -conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.41 & 5.59 \\ -Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.28 & 5.67 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.29 & 3.50 \\ -Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.27 & 3.57 \\ -invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.63} & 6.03 \\ -robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.65 & \textbf{3.14} \\ +none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.45 & 6.61 \\ +conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.34 & 6.20 \\ +conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.50 & 4.99 \\ +conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.43 & 5.67 \\ +Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.36 & 5.22 \\ +Raymaekers-Rousseeuw (z-score norm.) 
& & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.31 & 3.53 \\
+Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.29 & 3.59 \\
+invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.64} & 6.05 \\
+robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.66 & \textbf{3.14} \\
 
 \bottomrule
 \end{tabular}
@@ -1457,7 +1486,7 @@ \section{Discussion}\label{discussion}
 not suffer from this issue, as weights remained fixed during MLE.
 
 We introduced an empirical test for central normality to assess whether
-sequences deviate from normality in a way that might require closer
+features deviate from normality in a way that might require closer
 inspection prior to further processing. The empirical central normality
 test differs from other tests for normality, such as the Shapiro-Wilk
 test \citep{Shapiro1965-zd}, as it assesses normality of the central
@@ -1468,11 +1497,16 @@ \section{Discussion}\label{discussion}
 enables assessing reasonable normality of (transformed) features in the
 presence of outliers.
 
-This work has the following limitation: We observed several numerical
+This work has the following limitations: We observed several numerical
 stability issues for optimisation criteria other than MLE
 (\hyperref[appendix-b]{Appendix B}). These appear in regions where transformation
 parameters would lead to very large or small numbers when using conventional power
-transformations. For MLE stability issues were not observed.
+transformations. For MLE, stability issues were not observed. Moreover,
+the definition of the central normal distribution does not constrain the
+distribution of the non-central tails. Thus, the test may be
+conservative for central normal distributions other than uncontaminated
+normal distributions, because the location and scale estimators are not
+generally bias-free \citep{Rousseeuw1994-ur}.
 
 \section{Conclusion}\label{conclusion}
@@ -1484,6 +1518,7 @@ \section{Conclusion}\label{conclusion}
 both facilitate the use of power transformations in automated data
 analysis workflows.
 
+
 \backmatter
 
 \section*{Declarations}
@@ -2027,10 +2062,10 @@ \subsection{Ranking Box-Cox
 \begin{table}
 \caption{ Comparison of average rank between Box-Cox transformation methods based on either residual error (without outliers) or residual error of the central portion
-(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Box-Cox transformation
-of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions.
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
-If necessary, each sequence was shifted so that every element had a strictly positive value.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. The clean dataset consists of features derived through inverse Box-Cox transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
+If necessary, each feature was shifted so that every element had a strictly positive value.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
 A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold.
 }
@@ -2043,15 +2078,15 @@ \subsection{Ranking Box-Cox
 
 \midrule
 
-none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.47 & 6.78 \\
-conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.10 \\
-conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.34 & 5.84 \\
-conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.35 & 5.84 \\
-Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.30 & 5.80 \\
-Raymaekers-Rousseeuw (z-score norm.) & & 6.23 & 3.80 & 4.93 & 3.85 & 4.48 & 3.50 \\
-Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.53 & 3.51 \\
-invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.51} & 5.58 \\
-robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & \textbf{3.51} & \textbf{2.00} \\
+none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.46 & 6.77 \\
+conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.26 \\
+conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.39 & 5.96 \\
+conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.40 & 5.96 \\
+Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.29 & 5.35 \\
+Raymaekers-Rousseeuw (z-score norm.) & & 6.23 & 3.80 & 4.93 & 3.85 & 4.53 & 3.51 \\
+Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.58 & 3.52 \\
+invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.53} & 5.67 \\
+robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & 3.55 & \textbf{2.00} \\
 
 \bottomrule
 \end{tabular}
@@ -2063,26 +2098,24 @@ \subsection{Ranking Yeo-Johnson transformations: residual
 errors}\label{ranking-yeo-johnson-transformations-residual-errors}
 
 In the main manuscript, transformation methods are compared by ranking
-within each experiment, defined by the combination of sequence and
+within each experiment, defined by the combination of feature and
 absence/presence of outliers. This allows for direct comparison of
 methods between different experiments.
 
 Table \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual}
 shows the average normalised residual error for these transformation
 methods. This metric is computed from the residual error and the
-residual error of the central portion (\(\kappa = 0.80\)) for sequences
+residual error of the central portion (\(\kappa = 0.80\)) for features
 without and with outliers, respectively. In each experiment, the
-respective residual error was normalised by dividing with the sequence
+respective residual error was normalised by dividing by the feature
+length, and can be interpreted as the average residual error of a single
-sample. This reduces the effect of sequence length on the residual
-error.
+sample. This reduces the effect of feature length on the residual error.
 
 Table
 \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual_diff}
 shows the average normalised difference in residual error between the
 best method for each experiment and the individual methods. As above,
-normalisation was done by division by the length of the sequence
-corresponding to each experiment.
+normalisation was done by division by the feature length of each experiment.
 
 Both tables show that the differences between best-performing methods
 on each dataset are relatively small.
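For reference, the within-experiment ranking underlying these tables is simple to express. A base-R sketch, assuming a long-format data frame `results` with hypothetical columns `experiment`, `method`, and `residual_error`:

```r
# Rank methods within each experiment (lower residual error is better),
# then average the ranks per method across all experiments.
average_rank <- function(results) {
  ranks <- ave(results$residual_error, results$experiment, FUN = rank)
  sort(tapply(ranks, results$method, mean))
}
```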
@@ -2090,10 +2123,10 @@ \subsection{Ranking Yeo-Johnson transformations: residual \begin{table} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. @@ -2108,10 +2141,10 @@ \subsection{Ranking Yeo-Johnson transformations: residual \midrule none & & 0.202 & 0.112 & 0.128 & 0.080 & 0.129 & 0.080 \\ -conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.129 & 0.080 \\ +conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.126 & 0.080 \\ conventional (z-score norm.) & & 0.061 & 0.094 & 0.091 & 0.073 & 0.091 & 0.073 \\ conventional (robust scaling) & & 0.058 & 0.094 & 0.090 & 0.073 & 0.090 & 0.073 \\ -Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.129 & 0.080 \\ +Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.126 & 0.079 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.087 & 0.057 & 0.097 & 0.060 & 0.097 & 0.060 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.071 & 0.056 & 0.095 & 0.059 & 0.095 & 0.059 \\ invariant & & 0.057 & 0.099 & 0.083 & 0.075 & 0.083 & 0.075 \\ @@ -2125,16 +2158,16 @@ \subsection{Ranking Yeo-Johnson transformations: residual \caption{ Comparison of average difference in residual error between Yeo-Johnson transformation methods and the best transformation method within each -experiment, normalised by the corresponding sequence length. This is based on +experiment, normalised by the corresponding feature length. This is based on either the residual error (without outliers) or the residual error of the central portion (with outliers; $\kappa = 0.80$) over 3 datasets with 100000 -sequences each. Residual errors were normalised by sequence length to reduce the -effect of sequence length on the residual error. The clean dataset consists of -sequences derived through inverse Yeo-Johnson transformation of data sampled -from a standard normal distribution. The dirty dataset contains sequences +features each. 
Residual errors were normalised by feature length to reduce the +effect of feature length on the residual error. The clean dataset consists of +features derived through inverse Yeo-Johnson transformation of data sampled +from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. The -shifted dataset also contains sequences sampled from asymmetric generalised -normal distributions, but centred at 100, and scaled by 0.001. Several +shifted dataset also contains features sampled from asymmetric generalised +normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower difference is better, with a minimum value of 0.0. Discrepancies with the rank-based method may occur, @@ -2151,10 +2184,10 @@ \subsection{Ranking Yeo-Johnson transformations: residual \midrule none & & 0.150 & 0.068 & 0.050 & 0.033 & 0.049 & 0.033 \\ -conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.049 & 0.033 \\ +conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.046 & 0.032 \\ conventional (z-score norm.) & & 0.008 & 0.049 & 0.013 & 0.026 & 0.011 & 0.025 \\ conventional (robust scaling) & & 0.005 & 0.050 & 0.012 & 0.026 & 0.010 & 0.026 \\ -Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.049 & 0.033 \\ +Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.046 & 0.031 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.034 & 0.012 & 0.018 & 0.014 & 0.017 & 0.013 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.018 & 0.011 & 0.017 & 0.012 & 0.015 & 0.011 \\ invariant & & 0.004 & 0.054 & 0.005 & 0.029 & 0.003 & 0.028 \\ @@ -2166,16 +2199,16 @@ \subsection{Ranking Yeo-Johnson transformations: residual \FloatBarrier -\subsection{Ranking Yeo-Johnson transformations: effect of sequence -length}\label{ranking-yeo-johnson-transformations-effect-of-sequence-length} +\subsection{Ranking Yeo-Johnson transformations: effect of feature +length}\label{ranking-yeo-johnson-transformations-effect-of-feature-length} The main manuscript shows the overall ranking of different Yeo-Johnson-based power transformation methods across three sets of -problems. To assess the effect of limited sequence lengths, we perform +problems. To assess the effect of limited feature lengths, we perform ranking on subsets of the experiment. The results for ranking the subset -of sequences with 30 samples or fewer (\(n = 16001\)) is shown in Table +of features with 30 samples or fewer (\(n = 16001\)) is shown in Table \ref{tab:comparison_methods_simulations_yeo_johnson_10_30}. In the clean -dataset without outliers, where all sequences were sampled from a normal +dataset without outliers, where all features were sampled from a normal distribution and then underwent an inverse Yeo-Johnson transformation, conventional and location- and scale-invariant are ranked similarly, with no clear advantage to any method. For the remaining datasets, the @@ -2185,8 +2218,8 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \citep{Raymaekers2024-zf}, in datasets without and with outliers, respectively. 
-In the subset for larger sequence lengths, with at least 1000 samples -per sequence (\(n = 33037\)), the cost of making the location- and +In the subset for larger feature lengths, with at least 1000 samples per +feature (\(n = 33037\)), the cost of making the location- and scale-invariant transformation robust against outliers becomes more apparent (Table \ref{tab:comparison_methods_simulations_yeo_johnson_1000_10000}). In the @@ -2198,10 +2231,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{table} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Each sequence contained 30 samples or fewer. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Each feature contained 30 samples or fewer. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -2214,15 +2247,15 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule -none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.45 & 6.60 \\ -conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.55 & 5.94 \\ -conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.35 & 4.70 \\ -conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.76 & 5.20 \\ -Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.32 & 5.83 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 5.56 & 4.08 & 5.67 & 4.05 & 5.03 & 3.81 \\ -Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.55 & 3.94 \\ -invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.62 & 5.37 \\ -robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.61} \\ +none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.44 & 6.59 \\ +conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.35 & 6.00 \\ +conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.36 & 4.71 \\ +conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.77 & 5.21 \\ +Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.47 & 5.72 \\ +Raymaekers-Rousseeuw (z-score norm.) 
& & 5.56 & 4.08 & 5.67 & 4.05 & 5.04 & 3.82 \\ +Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.56 & 3.95 \\ +invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.64 & 5.38 \\ +robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.62} \\ \bottomrule \end{tabular} @@ -2231,10 +2264,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{table} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -Each sequence contained 30 samples or fewer. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +Each feature contained 30 samples or fewer. The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. @@ -2249,10 +2282,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule none & & 0.246 & 0.173 & 0.186 & 0.156 & 0.187 & 0.157 \\ -conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.187 & 0.157 \\ +conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.184 & 0.155 \\ conventional (z-score norm.) & & 0.139 & 0.142 & 0.142 & 0.130 & 0.142 & 0.130 \\ conventional (robust scaling) & & 0.139 & 0.142 & 0.141 & 0.130 & 0.141 & 0.130 \\ -Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.187 & 0.157 \\ +Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.185 & 0.155 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.295 & 0.161 & 0.175 & 0.156 & 0.175 & 0.156 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.209 & 0.155 & 0.172 & 0.144 & 0.172 & 0.144 \\ invariant & & 0.141 & 0.144 & 0.140 & 0.132 & 0.140 & 0.132 \\ @@ -2265,10 +2298,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{table} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 sequences each. Each sequence contained between 1000 and 10000 samples. 
-The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 features each. Each feature contained between 1000 and 10000 samples. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -2281,15 +2314,15 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule -none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.41 & 6.87 \\ -conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.42 & 6.05 \\ -conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.26 & 4.79 \\ -conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.87 & 5.65 \\ -Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.19 & 5.79 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.49 & 3.30 \\ -Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.12 & 3.23 \\ -invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.98} & 6.81 \\ -robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.26 & \textbf{2.51} \\ +none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.40 & 6.86 \\ +conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.24 & 6.45 \\ +conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.29 & 4.92 \\ +conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.90 & 5.79 \\ +Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.22 & 5.04 \\ +Raymaekers-Rousseeuw (z-score norm.) & & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.52 & 3.35 \\ +Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.16 & 3.27 \\ +invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.99} & 6.82 \\ +robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.27 & \textbf{2.52} \\ \bottomrule \end{tabular} @@ -2298,10 +2331,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{table} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -Each sequence contained between 1000 and 10000 samples. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. 
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error.
+Each feature contained between 1000 and 10000 samples. The clean dataset consists of features derived through inverse Yeo-Johnson transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
 A lower normalised residual error is better.
 Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are.
@@ -2316,10 +2349,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence
 \midrule
 none & & 0.186 & 0.091 & 0.106 & 0.053 & 0.106 & 0.053 \\
-conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.106 & 0.053 \\
+conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.103 & 0.053 \\
 conventional (z-score norm.) & & 0.021 & 0.076 & 0.070 & 0.050 & 0.070 & 0.050 \\
 conventional (robust scaling) & & 0.014 & 0.076 & 0.069 & 0.050 & 0.069 & 0.050 \\
-Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.106 & 0.053 \\
+Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.103 & 0.051 \\
 Raymaekers-Rousseeuw (z-score norm.) & & 0.021 & 0.020 & 0.070 & 0.030 & 0.070 & 0.030 \\
 Raymaekers-Rousseeuw (robust scaling) & & 0.015 & 0.020 & 0.068 & 0.030 & 0.068 & 0.030 \\
 invariant & & 0.013 & 0.084 & 0.059 & 0.055 & 0.059 & 0.055 \\
@@ -2337,18 +2370,18 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence
 \subsection{Examples using clean data}\label{examples-using-clean-data}
 
 Robust transformations are hypothesised to have a cost in efficiency for
-data without outliers, i.e.~clean data. Here we draw nine sequences with
+data without outliers, i.e.~clean data. Here we construct nine features with
 elements randomly drawn from a standard normal distribution
 \(\mathcal{N}(0,1)\). Subsequently, we perform an inverse transformation
 \(\left(\phi^{\lambda, 0, 1}\right)^{-1}\), with
 \(\lambda \in \left(0, 2\right)\).
-The sequences drawn resulted from the permutations of the number of
-elements of each sequence (\(n \in \{30, 100, 500\}\)) and
-transformation parameter for the inverse transformation
-\(\lambda \in \{0.1, 1.0, 1.9\}\). Each sequence then underwent power
+The nine features resulted from the combinations of the number of
+elements of each feature (\(n \in \{30, 100, 500\}\)) and transformation
+parameter for the inverse transformation
+\(\lambda \in \{0.1, 1.0, 1.9\}\). Each feature then underwent power
 transformation using Box-Cox and Yeo-Johnson transformations. For
-Box-Cox transformations, each sequence was shifted prior to inverse
+Box-Cox transformations, each feature was shifted prior to inverse
 transformation, ensuring all elements were strictly positive after
 inverse transformation.
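[Editor's note: to make the setup above concrete, here is a minimal R sketch for a single clean feature (n = 100, lambda = 0.1). The `inverse_yeo_johnson()` helper is an illustrative transcription of the inverse of the Yeo-Johnson transformation and is not a function exported by power.transform; the `find_transformation_parameters()` call assumes the remaining arguments can be left at their defaults.]

```r
# Sketch: one "clean" feature from the setup above. The inverse helper is
# written out from the Yeo-Johnson definition; both branch conditions hold
# here because lambda is neither 0 nor 2.
inverse_yeo_johnson <- function(z, lambda) {
  ifelse(
    z >= 0,
    (lambda * z + 1)^(1 / lambda) - 1,               # positive branch, lambda != 0
    1 - (1 - (2 - lambda) * z)^(1 / (2 - lambda))    # negative branch, lambda != 2
  )
}

set.seed(1L)
z <- stats::rnorm(n = 100L)                # standard normal draws
x <- inverse_yeo_johnson(z, lambda = 0.1)  # skewed clean feature

# Re-estimate a transformation parameter with the package itself.
transformer <- power.transform::find_transformation_parameters(
  x = x,
  method = "yeo_johnson"
)
```

Repeating this over the nine \((n, \lambda)\) combinations yields the grid of clean features used in this subsection.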
diff --git a/manuscript_latex/refs.bib b/manuscript_latex/refs.bib index d1dc70a..d69530e 100644 --- a/manuscript_latex/refs.bib +++ b/manuscript_latex/refs.bib @@ -453,3 +453,46 @@ @ARTICLE{Croux1992-zz doi = "10.1080/03610929208830889", issn = "0361-0926,1532-415X" } + +@INCOLLECTION{van-der-Vaart1998-xu, + title = "Quantiles and Order Statistics", + author = "van der Vaart, A W", + booktitle = "Asymptotic Statistics", + publisher = "Cambridge University Press", + address = "Cambridge", + pages = "304--315", + month = oct, + year = 1998, + doi = "10.1017/cbo9780511802256.022", + isbn = "9780521496032,9780521784504" +} + + +@ARTICLE{Huber1964-xk, + title = "Robust estimation of a location parameter", + author = "Huber, Peter J", + journal = "Ann. Math. Stat.", + publisher = "Institute of Mathematical Statistics", + volume = 35, + number = 1, + pages = "73--101", + month = mar, + year = 1964, + doi = "10.1214/aoms/1177703732", + issn = "0003-4851,2168-8990" +} + + +@ARTICLE{Rousseeuw1994-ur, + title = "The bias of k-step {M}-estimators", + author = "Rousseeuw, Peter J and Croux, Christophe", + journal = "Stat. Probab. Lett.", + publisher = "Elsevier BV", + volume = 20, + number = 5, + pages = "411--420", + month = aug, + year = 1994, + doi = "10.1016/0167-7152(94)90133-3", + issn = "0167-7152,1879-2103" +} diff --git a/manuscript_rmarkdown/gc_theorem_data.RDS b/manuscript_rmarkdown/gc_theorem_data.RDS new file mode 100644 index 0000000..ddf69d0 Binary files /dev/null and b/manuscript_rmarkdown/gc_theorem_data.RDS differ diff --git a/manuscript_rmarkdown/manuscript.Rmd b/manuscript_rmarkdown/manuscript.Rmd index a7de312..94a6522 100644 --- a/manuscript_rmarkdown/manuscript.Rmd +++ b/manuscript_rmarkdown/manuscript.Rmd @@ -101,7 +101,7 @@ The two most commonly used transformations are that of @Box1964-mz and @Yeo2000- The Box-Cox transformation is defined as: \begin{definition}[Box-Cox power transformation] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a vector of length $n > 0$. Let $\lambda \in \mathcal{R}$ be a transformation parameter. Then the Box-Cox power transformation of element $x_i$ is defined as: \begin{equation} @@ -118,7 +118,7 @@ One limitation of the Box-Cox transformation is that it is only defined for $x_i In contrast, the Yeo-Johnson transformation under the transformation parameter $\lambda$ is defined for any $x_i \in \mathbb{R}$, as follows: \begin{definition}[Yeo-Johnson power transformation] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a vector of length $n > 0$. Let $\lambda \in \mathcal{R}$ be a transformation parameter. \begin{equation} \label{eqn:yeo-johnson-original} @@ -182,7 +182,7 @@ First, we define a (numeric) feature. Throughout this work, whenever reference i a numeric feature is meant unless noted otherwise. \begin{definition}[Numeric feature] -A numeric feature is a finite sequence $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \right\}$ of length $n$. +A numeric feature is a vector $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ of length $n$. 
\end{definition} ## Location- and scale-invariant power transformation @@ -198,7 +198,7 @@ is then defined as: \begin{definition}[Location- and scale- invariant Box-Cox power transformation] Let $\lambda \in \mathcal{R}$ be a transformation parameter. Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i - x_0 > 0 \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Box-Cox power transformation of element $x_i$ is defined as: @@ -218,7 +218,7 @@ under transformation parameter $\lambda$, shift parameter $x_0$ and scale parame \begin{definition}[Location- and scale- invariant Yeo-Johnson power transformation] Let $\lambda \in \mathcal{R}$ be a transformation parameter. Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathcal{R} \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Yeo-Johnson power transformation of element $x_i$ is defined as: @@ -241,7 +241,7 @@ The location- and scale-invariant Box-Cox log-likelihood function is: \begin{definition} [Log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Box-Cox power transformation is defined as: \begin{equation} @@ -257,7 +257,7 @@ Similarly, the location- and scale-invariant Yeo-Johnson log-likelihood function \begin{definition}[Log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation is defined as: \begin{equation} @@ -288,12 +288,12 @@ The weighted location- and scale-invariant Box-Cox log-likelihood function is: \begin{definition}[Weighted log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. 
-Let $\mu_w$ be the weighted mean of the Box-Cox transformed sequence: +Let $\mu_w$ be the weighted mean of the Box-Cox transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{BC}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -317,12 +317,12 @@ Analogously, the weighted location- and scale-invariant Yeo-Johnson log-likeliho Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. -Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed sequence: +Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{YJ}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -482,8 +482,8 @@ Deviations from normality can be detected by normality tests, such as the Shapir In practice, normality tests may be too stringent with large sample sizes, outliers, or both. Here we develop an empirical test for central normality. -\begin{definition}[Central portion of a sequence] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R}\right\}$ be a finite sequence of length $n > 0$, +\begin{definition}[Central portion of a numeric feature] +Let $\mathbf{X}$ be a numeric feature of length $n > 0$, as before, ordered so that $x_1 \leq x_2 \leq \ldots \leq x_n$. Let $p_i = \frac{i - 1/3}{n + 1/3}$, with $i = 1, 2, \ldots n$ be the percentile value corresponding to each element in $\mathbf{X}$. For elements with tied values in $\mathbf{X}$, @@ -497,7 +497,7 @@ $\mathbf{X}_{\text{c}} = \left\{x_i \in \mathbf{X} \, | \, \frac{1-\kappa}{2} \ \end{definition} -\begin{definition}[Residual errors of the central portion of a sequence] +\begin{definition}[Residual errors of the central portion of a feature] Let the residual error for each element of $\mathbf{X}$ be $r_i =\left| \frac{x_i - \mu_M}{\sigma_M} - F^{-1}_{\mathcal{N}}(p_i) \right|$, with $\mu_M$ and $\sigma_M$ robust Huber M-estimates of location and scale of $\mathbf{X}$ [@Huber1981-su], @@ -510,31 +510,51 @@ $\mathbf{R}_{\text{c}} = \left\{ r_i \in \left\{ r_1, r_2, \ldots, r_n\right\} \ \begin{definition}[Central normal distribution] -A central normal distribution is any distribution whose quantile function is -defined by $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, +A central normal distribution is any distribution whose quantile function +satisfies $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, with $\mathcal{N}$ any normal distribution. \end{definition} \begin{corollary} -For $\kappa = 1.0$, the central normal distribution is a normal distribution. +A normal distribution is a central normal distribution. 
\end{corollary}
 
 \begin{corollary}
 
-By the Glivenko-Cantelli theorem, if a sequence $\mathbf{X}$ was sampled from a
-central normal distribution, then almost surely
-$\lim_{n \rightarrow \infty} \sum_{r_j \in \mathbf{R}_{\text{c}}} r_j = 0$,
+For $\kappa = 1.0$, a central normal distribution is a normal distribution.
 
 \end{corollary}
 
+\begin{corollary}
+
+If $\mathbf{X}$ is randomly sampled from a normal distribution,
+$\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0$, almost surely.
+That is, every residual of the central portion of $\mathbf{X}$ will asymptotically
+converge to 0.
+
+\end{corollary}
+
+The latter corollary follows from the definition of the residuals and the restriction
+to normal distributions. Residuals were defined using
+$\frac{x_i - \mu_M}{\sigma_M} \approx \hat{F}^{-1}_{\mathcal{N}}(p_i)$, with
+$\hat{F}^{-1}_{\mathcal{N}}(p_i)$ the empirical quantile function resembling
+the quantile function of the normal distribution $\mathcal{N}(0, 1)$.
+Since $\hat{F}^{-1}_{\mathcal{N}}(p) \rightarrow F^{-1}_{\mathcal{N}}(p)$ for
+$\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$ [@van-der-Vaart1998-xu],
+$\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0$, if and only if
+for all $x_i \in \mathbf{X}_{\text{c}}$, $\frac{x_i - \mu_M}{\sigma_M} \rightarrow \hat{F}^{-1}_{\mathcal{N}}(p_i)$.
+Thus, the requirement is that the estimators for location and scale $\mu_M$ and $\sigma_M$
+are bias-free. M-estimators for location and scale satisfy this condition if $\mathbf{X}$
+follows a normal distribution without contamination [@Huber1964-xk; @Huber1981-su].
+
 \begin{definition}[Central normality test]
-The null-hypothesis $\mathcal{H}_0$ is that the sequence $\mathbf{X}$ was sampled
+The null hypothesis $\mathcal{H}_0$ is that $\mathbf{X}$ was randomly sampled
 from a population that is distributed according to a central normal distribution.
-The alternative hypothesis is that the sequence $\mathbf{X}$ was sampled from a
+The alternative hypothesis is that $\mathbf{X}$ was randomly sampled from a
 population that is not distributed according to a central normal distribution.
 \end{definition}
@@ -567,7 +587,7 @@ Of note, the `power.transform` package shifts feature values into the positive d
 ## Invariance to location and scale
 
 To assess whether the proposed power transformations lead to values of $\lambda$
-that are invariant to location and scale of the distribution, we simulated three different sequences.
+that are invariant to location and scale of the distribution, we simulated three different numeric features.
 We first randomly drew $10000$ values from a normal distribution:
 $\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{N}\left(0, 1\right)$,
 or equivalently $\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.5, 2\right)$.
@@ -598,12 +618,12 @@ In contrast, estimation of $\lambda$ for invariant power transformations was inv
 ## Empirical central normality test {#simulation-ecn-test}
 
 The critical test statistic for the empirical central normality test depends on
-the significance level $\alpha$, sequence length $n$ and central portion $\kappa$.
+the significance level $\alpha$, feature length $n$ and central portion $\kappa$.
 Critical test statistic values were set through simulation, as described below.
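[Editor's note: before the calibration details, the statistic being calibrated is compact enough to sketch in R. This is a rough illustrative reimplementation, not the package's internal code; `MASS::huber()` stands in for the Huber M-estimates of location and scale, and tied values do not receive the averaged percentile values that the definition requires.]

```r
# Rough sketch of the central-portion statistic tau_kappa from the
# definitions above: mean absolute residual between standardised values
# and normal quantiles, restricted to the central portion.
tau_kappa <- function(x, kappa = 0.80) {
  x <- sort(x)
  n <- length(x)
  p <- (seq_len(n) - 1 / 3) / (n + 1 / 3)        # percentile value per element
  h <- MASS::huber(x)                            # robust location (mu) and scale (s)
  r <- abs((x - h$mu) / h$s - stats::qnorm(p))   # residual vs. normal quantile
  central <- p >= (1 - kappa) / 2 & p <= (1 + kappa) / 2
  mean(r[central])                               # mean residual of the central portion
}

tau_kappa(stats::rnorm(1000L))  # small for draws from a normal distribution
```

The simulation described next tabulates critical values of exactly this statistic over $n$ and $\kappa$.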
For each number of samples
 $n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 4.0000 \right \} \right\}$
-we randomly sampled $m_d = 30000$ ordered sequences $\mathbf{X}$ from $\mathcal{N}(0,1)$.
-For each sequence we then computed $\tau_{\kappa}$ for
+we randomly sampled $m_d = 30000$ ordered numeric features $\mathbf{X}$ from $\mathcal{N}(0,1)$.
+For each feature we then computed $\tau_{\kappa}$ for
 $\kappa \in \left\{0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.00\right\}$.
 
 ```{r include = FALSE, eval = FALSE}
@@ -642,19 +662,19 @@ cap <- paste0(
 ```
 
 To assess type I error rates for the empirical central normality test and compare
-these to the Shapiro-Wilk test for normality, we randomly drew another 10000 sequences from a
+these to the Shapiro-Wilk test for normality, we randomly drew another 10000 numeric features from a
 central normal distribution for each
 $n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 3.0000 \right \} \right\}$.
-Each sequence was created as follows: first $n$ values are sampled from $\mathcal{N}(0, 1)$.
-Then, the elements outside the central portion of the sequence (i.e. $\mathbf{X} \setminus \mathbf{X}_{\text{c}}$) are replaced.
+Each feature was created as follows: first $n$ values are sampled from $\mathcal{N}(0, 1)$.
+Then, the elements outside the central portion of the feature (i.e. $\mathbf{X} \setminus \mathbf{X}_{\text{c}}$) are replaced.
 Elements with $p_i < \frac{1-\kappa}{2}$ are replaced with values sampled from $\mathcal{U}(-10,\min(\mathbf{X}_{\text{c}}))$,
 and elements with $p_i > \frac{1 + \kappa}{2}$ are replaced with values sampled from $\mathcal{U}(\max(\mathbf{X}_{\text{c}}), 10)$.
 
-For each sequence, the p-value for the respective test was used to reject the null
-hypothesis that sequences were derived from a population with a (central) normal distribution.
+For each feature, the p-value for the respective test was used to reject the null
+hypothesis that features were derived from a population with a (central) normal distribution.
 The null hypothesis was rejected if $p \leq 0.05$.
-The type I error rate was determined by computing the fraction of sequences rejected this way.
+The type I error rate was determined by computing the fraction of features rejected this way.
 
 Figure \ref{fig:empirical-central-normality-error_rate} shows that when the
 central portion $\kappa$ of the central normal distribution is equal to or greater
 than the $\kappa$ parameter for the empirical central normality test, the tests
@@ -725,13 +745,13 @@ the z-score of the transformed feature values, or the residual error between
 the z-score of the transformed feature values and their expected z-score based
 on the normal distribution.
 
 To determine the weighting function parameters for each of the nine combinations,
-$m_d=500$ sequences $\mathbf{X}_i$ ($i \in \{1, 2, \ldots, m_d\}$) were randomly
+$m_d=500$ numeric features $\mathbf{X}_i$ ($i \in \{1, 2, \ldots, m_d\}$) were randomly
 drawn from randomly parametrised asymmetric generalised normal distributions.
 Each distribution was parametrised with a random skewness parameter
 $\alpha \sim U\left(0.01, 0.99\right)$ and shape parameter $\beta \sim U\left(1.00, 5.00 \right)$.
 Location and scale parameters were set as $\mu = 0$ and $\sigma = 1$, respectively.
-To form each sequence $\mathbf{X}_i$, $n = \lceil 10^\gamma \rceil$ instances were then randomly drawn, -with $\gamma \sim U\left(\log_{10}50, 3\right)$, resulting in a set of sequences with between $50$ and $1000$ elements each. +To form each feature $\mathbf{X}_i$, $n = \lceil 10^\gamma \rceil$ instances were then randomly drawn, +with $\gamma \sim U\left(\log_{10}50, 3\right)$, resulting in a set of features with between $50$ and $1000$ elements each. Outlier values were then drawn to randomly replace 10 percent of the elements of $\mathbf{X}_i$. These values were set according to @Tukey1977-xm, as follows. Let $x^{*} \sim U\left(-2, 2\right)$. @@ -751,13 +771,13 @@ Here, $Q_1$, $Q_3$ and $\text{IQR}$ are the first quartile, third quartile and i To find the optimal values for the weighting function parameters $\delta_1$ and $\delta_2$ (if applicable), we minimised a composite loss $L = \sum_{i=1}^{m_d} L_{\text{cn},i} + 0.1 \sum_{i=1}^{m_d} L_{\lambda,i}$. The composite loss consisted of two components: a loss term $L_{\text{cn},i} = \tau_{n_i, \kappa = 0.80,i}$, i.e. the mean of residual -errors of the central 80% of elements of each sequence $\mathbf{X}_i$, which aimed at optimising central normality; +errors of the central 80% of elements of each feature $\mathbf{X}_i$, which aimed at optimising central normality; and a loss term $L_{\lambda,i} = \max\left(0.0,|\lambda_{0,i} - \lambda_{i}| - \xi \right)$, with $\lambda_{0,i}$ and $\lambda_{i}$ transformation parameters found -for sequence $\mathbf{X}_i$ prior to and after adding outliers, respectively, +for feature $\mathbf{X}_i$ prior to and after adding outliers, respectively, and tolerance parameter $\xi = 0.5$ for Box-Cox and $\xi = 0.3$ for Yeo-Johnson power transformations. This second term aimed to prevent solutions that provide small improvements in central normality -at the cost of a poor fit of the tails of sequences. +at the cost of a poor fit of the tails of features. Minimisation was conducted using the BOBYQA algorithm for derivative-free bound constraint optimisation [@Powell2009-zb]. The resulting weighting function parameters for weighted MLE are shown in Tables @@ -855,47 +875,55 @@ $r$ (tapered cosine) & 0.50 & $(0, 10]$ & 1.06 & 1.00 & $(0, 10]$ & 1.07 & 6 ## Assessing transformations using simulated data -Three datasets with 100000 sequences each were created to assess transformation -to normality. For each sequence $\mathbf{X}_i$, $n = \lceil 10^\gamma \rceil$ +Three datasets with 100000 numeric features each were created to assess transformation +to normality. For each feature $\mathbf{X}_i$, $n = \lceil 10^\gamma \rceil$ elements were randomly drawn, with $\gamma \sim U\left(1, 4\right)$, -resulting in sequences with between $10$ and $10000$ elements. +resulting in features with between $10$ and $10000$ elements. -- A \textit{clean} dataset with each sequence drawn from $\mathcal{N}(0, 1)$ and +- A \textit{clean} dataset with each feature drawn from $\mathcal{N}(0, 1)$ and transformed using an inverse power transformation with randomly drawn transformation parameter $\lambda \sim U\left(0.00, 2.00 \right)$. $\left(\phi_{\text{BC}}^\lambda\right)^{-1}$ and $\left(\phi_{\text{YJ}}^\lambda\right)^{-1}$ were used as inverse transformations for assessing Box-Cox and Yeo-Johnson transformations, - respectively. Prior to inverse transformation, sequences for Box-Cox transformations + respectively. 
Prior to inverse transformation, features for Box-Cox transformations
 were shifted into the positive domain, with minimum value $1$.
 
-- A \textit{dirty} dataset with each sequence drawn from $\mathcal{AGN}\left(\mu = 0, \sigma = 1/\sqrt{2}, \alpha, \beta \right)$,
+- A \textit{dirty} dataset with each feature drawn from $\mathcal{AGN}\left(\mu = 0, \sigma = 1/\sqrt{2}, \alpha, \beta \right)$,
   with randomly drawn skewness parameter $\alpha \sim U\left(0.01, 0.99\right)$
   and shape parameter $\beta \sim U\left(1.00, 5.00 \right)$.
+  The features are drawn from a range of asymmetric unimodal distributions
+  to reflect a wider variety of generating distributions than the
+  data presented in the first dataset.
 
-- A \textit{shifted} dataset with each sequence drawn from $\mathcal{AGN}\left(\mu = 100, \sigma = 10^{-3} \cdot 1/\sqrt{2} , \alpha, \beta \right)$,
+- A \textit{shifted} dataset with each feature drawn from
+  $\mathcal{AGN}\left(\mu = 1000, \sigma = 1/\sqrt{2} , \alpha, \beta \right)$,
   with randomly drawn skewness parameter $\alpha \sim U\left(0.01, 0.99\right)$
   and shape parameter $\beta \sim U\left(1.00, 5.00 \right)$.
+  The generating process creates features similar to those in the \textit{dirty}
+  dataset, but shifted to a location that leads to normalisation issues observed
+  for conventional power transformation methods, as shown in Figure \ref{fig:decreased-normality}.
 
 A dataset with outliers was created for each of the above datasets by replacing
-10 percent of elements in each sequence, as described earlier.
+10 percent of elements in each feature, as described earlier.
 
 In addition to no power transformation and location- and scale-invariant power
 transformations, conventional and Raymaekers and Rousseeuw's robust adaptation
 [@Raymaekers2024-zf] were assessed. For the latter two methods, normalisation
 before transformation was additionally assessed using the following two methods:
 
-1. z-standardisation: $x^{\prime}_{i} = \left(x_i - \mu \right) / \sigma$, with $\mu$ and $\sigma$ the mean and standard deviation of sequence $\mathbf{X}$.
+1. z-standardisation: $x^{\prime}_{i} = \left(x_i - \mu \right) / \sigma$, with $\mu$ and $\sigma$ the
+   mean and standard deviation of $\mathbf{X}$.
 
 2. robust scaling: $x^{\prime}_{i} = \left(x_i - \text{median}\left(\mathbf{X}\right) \right) / Q_n\left(\mathbf{X}\right)$,
    with $Q_n$ representing the robust $Q_n$ scale estimator of $\mathbf{X}$ [@Croux1992-zz; @Rousseeuw1993-it].
 
 This results in nine types of power transformation. Transformation parameters
-were optimised for each sequence. Subsequently the sum of residual errors
+were optimised for each feature. Subsequently, the sum of residual errors
 and sum of residual errors of the central portion ($\kappa = 0.80$) were computed
-for each sequence after transformation. Then, each method was ranked according
+for each feature after transformation. Then, each method was ranked according
 to the sum of residual errors (data without outliers) or the sum of residual
 errors of the central portion (data with outliers). The average rank of each
-transformation method was computed over all 100000 sequences in each dataset.
+transformation method was computed over all 100000 features in each dataset.
 Average ranks for Yeo-Johnson transformations are shown in Table
 \ref{tab:comparison_methods_simulations_yeo_johnson}.
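[Editor's note: the ranking step just described reduces to a grouped rank in `data.table`, which the manuscript's chunks already use. The sketch below runs on made-up residual errors and only three of the nine methods; all names and values are hypothetical.]

```r
# Toy sketch of the rank aggregation: rank the methods within each feature
# by residual error, then average the ranks per method.
library(data.table)

set.seed(1L)
toy <- data.table::CJ(
  feature = 1:100,
  method = c("none", "conventional", "invariant")
)
toy[, error := stats::runif(.N)]  # placeholder residual errors

toy[, method_rank := data.table::frank(error), by = feature]
toy[, .(average_rank = mean(method_rank)), by = method]
```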
Location- and scale-invariant Yeo-Johnson transformation ranked best for
@@ -937,9 +965,9 @@ data.table::dcast(transformation ~ dataset + outlier, data = table_data, value.v
 \begin{center}
 \caption{
 Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion
-(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation
-of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0.
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. The clean dataset consists of features derived through inverse Yeo-Johnson transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
 A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold.
 }
@@ -952,15 +980,15 @@ transformation & outliers: & no & yes & no & yes & no & yes \\
 \midrule
-none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.46 & 6.62 \\
-conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.51 & 5.95 \\
-conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.48 & 4.93 \\
-conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.41 & 5.59 \\
-Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.28 & 5.67 \\
-Raymaekers-Rousseeuw (z-score norm.) & & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.29 & 3.50 \\
-Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.27 & 3.57 \\
-invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.63} & 6.03 \\
-robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.65 & \textbf{3.14} \\
+none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.45 & 6.61 \\
+conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.34 & 6.20 \\
+conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.50 & 4.99 \\
+conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.43 & 5.67 \\
+Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.36 & 5.22 \\
+Raymaekers-Rousseeuw (z-score norm.) & & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.31 & 3.53 \\
+Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.29 & 3.59 \\
+invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.64} & 6.05 \\
+robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.66 & \textbf{3.14} \\
 \bottomrule
 \end{tabular}
@@ -1322,7 +1350,7 @@ Consequently, their weights change at each iteration in the MLE optimisation pro
 This increases local variance in the log-likelihood function and creates local optima
 that the optimiser may not handle well.
 Methods that relied on the empirical probability did not suffer from this issue, as weights remained fixed during MLE.
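[Editor's note: for reference, the weighted mean and variance that enter the weighted log-likelihood, as defined in the Theory section, are transcribed below. `phi_x` (transformed feature values) and `w` (element weights) are placeholder names; this is a sketch, not the package's implementation.]

```r
# Weighted mean and variance from the weighted log-likelihood definitions.
# When w derives from quantities that do not depend on lambda (e.g. empirical
# probabilities), it stays fixed across optimiser iterations, which is the
# stability property noted above.
weighted_moments <- function(phi_x, w) {
  stopifnot(length(phi_x) == length(w), all(w >= 0))
  mu_w <- sum(w * phi_x) / sum(w)
  sigma2_w <- sum(w * (phi_x - mu_w)^2) / sum(w)
  list(mu_w = mu_w, sigma2_w = sigma2_w)
}
```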
-We introduced an empirical test for central normality to assess whether sequences
+We introduced an empirical test for central normality to assess whether features
 deviate from normality in a way that might require closer inspection prior to further processing.
 The empirical central normality test differs from other tests for normality, such as the Shapiro-Wilk
 test [@Shapiro1965-zd], as it assesses normality of the central portion
@@ -1332,10 +1360,14 @@ normality test remains consistent when a feature was not sampled from a normal d
 such as a central normal distribution with $\kappa < 1.0$.
 This enables assessing reasonable normality of (transformed) features in the presence of outliers.
 
-This work has the following limitation: We observed several numerical stability
+This work has the following limitations: We observed several numerical stability
 issues for optimisation criteria other than MLE (Appendix B). These appear in
 regions where transformation parameters would lead to very large or small numbers when using
 conventional power transformations. For MLE, stability issues were not observed.
+Moreover, the definition of a central normal distribution does not constrain the
+distribution of the non-central tails. Thus, the test may be
+conservative for central normal distributions other than uncontaminated normal distributions, because the
+location and scale estimators are not generally bias-free [@Rousseeuw1994-ur].
 
 # Conclusion
diff --git a/manuscript_rmarkdown/manuscript.tex b/manuscript_rmarkdown/manuscript.tex
index aef3f7e..cba8c54 100644
--- a/manuscript_rmarkdown/manuscript.tex
+++ b/manuscript_rmarkdown/manuscript.tex
@@ -78,7 +78,7 @@
 \title{Location and Scale-Invariant Power Transformations for Transforming Data
 to Normality}
 \author{Alex Zwanenburg, Steffen Löck}
-\date{2025-10-29}
+\date{2025-12-19}
 \begin{document}
 \maketitle
@@ -118,7 +118,7 @@ \section{Introduction}\label{introduction}
 The Box-Cox transformation is defined as:
 
 \begin{definition}[Box-Cox power transformation]
-Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a finite sequence of length $n > 0$.
+Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i > 0 \right\}$ be a vector of length $n > 0$.
 Let $\lambda \in \mathcal{R}$ be a transformation parameter.
 Then the Box-Cox power transformation of element $x_i$ is defined as:
 \begin{equation}
@@ -137,7 +137,7 @@ \section{Introduction}\label{introduction}
 \(x_i \in \mathbb{R}\), as follows:
 
 \begin{definition}[Yeo-Johnson power transformation]
-Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a finite sequence of length $n > 0$.
+Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ be a vector of length $n > 0$.
 Let $\lambda \in \mathcal{R}$ be a transformation parameter.
 \begin{equation}
 \label{eqn:yeo-johnson-original}
@@ -219,7 +219,7 @@ \section{Theory}\label{theory}
 otherwise.
 
 \begin{definition}[Numeric feature]
-A numeric feature is a finite sequence $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \right\}$ of length $n$.
+A numeric feature is a vector $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R} \right\}$ of length $n$.
 \end{definition}
 
 \subsection{Location- and scale-invariant power
@@ -239,7 +239,7 @@ \subsection{Location- and scale-invariant power
 \begin{definition}[Location- and scale- invariant Box-Cox power transformation]
 Let $\lambda \in \mathcal{R}$ be a transformation parameter.
Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i - x_0 > 0 \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Box-Cox power transformation of element $x_i$ is defined as: @@ -260,7 +260,7 @@ \subsection{Location- and scale-invariant power \begin{definition}[Location- and scale- invariant Yeo-Johnson power transformation] Let $\lambda \in \mathcal{R}$ be a transformation parameter. Let $x_0 \in \mathcal{R}$ and $s > 0$ be location and scale parameters. -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathcal{R} \right\}$ be a finite sequence of length $n > 0$. +Let $\mathbf{X}$ be a numeric feature of length $n > 0$. Then the location- and scale-invariant Yeo-Johnson power transformation of element $x_i$ is defined as: @@ -284,7 +284,7 @@ \subsection{Location- and scale-invariant power \begin{definition} [Log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Box-Cox power transformation is defined as: \begin{equation} @@ -301,7 +301,7 @@ \subsection{Location- and scale-invariant power \begin{definition}[Log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). -Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed sequence $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. +Let $\mu$ and $\sigma^2$ be the mean and variance of the transformed feature $\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})$, respectively. Then the log-likelihood function of the location- and scale- invariant Yeo-Johnson power transformation is defined as: \begin{equation} @@ -345,12 +345,12 @@ \subsection{Robust location- and scale-invariant power \begin{definition}[Weighted log-likelihood function of the location- and scale- invariant Box-Cox power transformation] Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:box-cox-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. -Let $\mu_w$ be the weighted mean of the Box-Cox transformed sequence: +Let $\mu_w$ be the weighted mean of the Box-Cox transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{BC}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Box-Cox transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -373,12 +373,12 @@ \subsection{Robust location- and scale-invariant power Let $\mathbf{X}$, $\lambda$, $x_0$, and $s$ be defined as earlier (Eqn. \ref{eqn:yeo-johnson-invariant}). Let $w_i \geq 0$ be the weight corresponding to each element of $\mathbf{X}$. 
-Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed sequence: +Let $\mu_w$ be the weighted mean of the Yeo-Johnson transformed feature: \begin{equation*} \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{YJ}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i} \end{equation*} -Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed sequence: +Let $\sigma^2_w$ be the weighted variance of the Yeo-Johnson transformed feature: \begin{equation*} \sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \end{equation*} @@ -573,8 +573,8 @@ \subsection{Empirical central normality tests may be too stringent with large sample sizes, outliers, or both. Here we develop an empirical test for central normality. -\begin{definition}[Central portion of a sequence] -Let $\mathbf{X} = \left\{x_1, x_2, \ldots, x_n | x_i \in \mathbb{R}\right\}$ be a finite sequence of length $n > 0$, +\begin{definition}[Central portion of a numeric feature] +Let $\mathbf{X}$ be a numeric feature of length $n > 0$, as before, ordered so that $x_1 \leq x_2 \leq \ldots \leq x_n$. Let $p_i = \frac{i - 1/3}{n + 1/3}$, with $i = 1, 2, \ldots n$ be the percentile value corresponding to each element in $\mathbf{X}$. For elements with tied values in $\mathbf{X}$, @@ -588,7 +588,7 @@ \subsection{Empirical central normality \end{definition} -\begin{definition}[Residual errors of the central portion of a sequence] +\begin{definition}[Residual errors of the central portion of a feature] Let the residual error for each element of $\mathbf{X}$ be $r_i =\left| \frac{x_i - \mu_M}{\sigma_M} - F^{-1}_{\mathcal{N}}(p_i) \right|$, with $\mu_M$ and $\sigma_M$ robust Huber M-estimates of location and scale of $\mathbf{X}$ [@Huber1981-su], @@ -601,31 +601,56 @@ \subsection{Empirical central normality \begin{definition}[Central normal distribution] -A central normal distribution is any distribution whose quantile function is -defined by $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, +A central normal distribution is any distribution whose quantile function +satisfies $F^{-1}_{\mathcal{N}}$ for $\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}$, with $\mathcal{N}$ any normal distribution. \end{definition} \begin{corollary} -For $\kappa = 1.0$, the central normal distribution is a normal distribution. +A normal distribution is a central normal distribution. \end{corollary} \begin{corollary} -By the Glivenko-Cantelli theorem, if a sequence $\mathbf{X}$ was sampled from a -central normal distribution, then almost surely -$\lim_{n \rightarrow \infty} \sum_{r_j \in \mathbf{R}_{\text{c}}} r_j = 0$, +For $\kappa = 1.0$, a central normal distribution is a normal distribution. \end{corollary} +\begin{corollary} + +If $\mathbf{X}$ is randomly sampled from a normal distribution, +$\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0$, almost surely. +That is, every residual of the central portion of $\mathbf{X}$ will asymptotically +converge to 0. + +\end{corollary} + +The latter corollary follows from the definition of the residuals and +the restriction to normal distributions. Residuals were defined using +\(\frac{x_i - \mu_M}{\sigma_M} \approx \hat{F}^{-1}_{\mathcal{N}}(p_i)\), +with \(\hat{F}^{-1}_{\mathcal{N}}(p_i)\) the empirical quantile function +resembling the quantile function of the normal distribution +\(\mathcal{N}(0, 1)\). 
Since
+\(\hat{F}^{-1}_{\mathcal{N}}(p) \rightarrow F^{-1}_{\mathcal{N}}(p)\)
+for \(\frac{1-\kappa}{2} \leq p \leq \frac{1 + \kappa}{2}\)
+\citep{van-der-Vaart1998-xu},
+\(\sup_{r_j \in \mathbf{R}_{\text{c}}} r_j \rightarrow 0\), if and only
+if for all \(x_i \in \mathbf{X}_{\text{c}}\),
+\(\frac{x_i - \mu_M}{\sigma_M} \rightarrow \hat{F}^{-1}_{\mathcal{N}}(p_i)\).
+Thus, the requirement is that the estimators for location and scale
+\(\mu_M\) and \(\sigma_M\) are bias-free. M-estimators for location and
+scale satisfy this condition if \(\mathbf{X}\) follows a normal
+distribution without contamination
+\citep{Huber1964-xk, Huber1981-su}.
+
 \begin{definition}[Central normality test]
 
-The null-hypothesis $\mathcal{H}_0$ is that the sequence $\mathbf{X}$ was sampled
+The null hypothesis $\mathcal{H}_0$ is that $\mathbf{X}$ was randomly sampled
 from a population that is distributed according to a central normal distribution.
-The alternative hypothesis is that the sequence $\mathbf{X}$ was sampled from a
+The alternative hypothesis is that $\mathbf{X}$ was randomly sampled from a
 population that is not distributed according to a central normal distribution.
 
 \end{definition}
@@ -667,8 +692,8 @@ \subsection{Invariance to location and
 
 To assess whether the proposed power transformations lead to values of
 \(\lambda\) that are invariant to location and scale of the
-distribution, we simulated three different sequences. We first randomly
-drew \(10000\) values from a normal distribution:
+distribution, we simulated three different numeric features. We first
+randomly drew \(10000\) values from a normal distribution:
 \(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{N}\left(0, 1\right)\),
 or equivalently
 \(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.5, 2\right)\).
@@ -706,15 +731,15 @@ \subsection{Invariance to location and
 \subsection{Empirical central normality test}\label{simulation-ecn-test}
 
 The critical test statistic for the empirical central normality test
-depends on the significance level \(\alpha\), sequence length \(n\) and
+depends on the significance level \(\alpha\), feature length \(n\) and
 central portion \(\kappa\). Critical test statistic values were set
 through simulation, as described below.
 
 For each number of samples
 \(n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 4.0000 \right \} \right\}\)
-we randomly sampled \(m_d = 30000\) ordered sequences \(\mathbf{X}\)
-from \(\mathcal{N}(0,1)\). For each sequence we then computed
-\(\tau_{\kappa}\) for
+we randomly sampled \(m_d = 30000\) ordered numeric features
+\(\mathbf{X}\) from \(\mathcal{N}(0,1)\). For each feature we then
+computed \(\tau_{\kappa}\) for
 \(\kappa \in \left\{0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.00\right\}\).
 
 Figure \ref{fig:empirical-central-normality-combined} shows the critical
@@ -734,23 +759,23 @@ \subsection{Empirical central normality test}\label{simulation-ecn-test}
 To assess type I error rates for the empirical central normality test
 and compare these to the Shapiro-Wilk test for normality, we randomly
-drew another 10000 sequences from a central normal distribution for each
+drew another 10000 numeric features from a central normal distribution
+for each
 \(n \in \left\{\lfloor 10^\nu \rfloor | \nu \in \left\{0.7500, 0.8125, \ldots, 3.0000 \right \} \right\}\).
-Each sequence was created as follows: first \(n\) values are sampled
-from \(\mathcal{N}(0, 1)\). 
Then, the elements outside the central
-portion of the sequence
-(i.e.~\(\mathbf{X} \setminus \mathbf{X}_{\text{c}}\)) are replaced.
-Elements with \(p_i < \frac{1-\kappa}{2}\) are replaced with values
-sampled from \(\mathcal{U}(-10,\min(\mathbf{X}_{\text{c}}))\), and
-elements with \(p_i > \frac{1 + \kappa}{2}\) are replaced with values
-sampled from \(\mathcal{U}(\max(\mathbf{X}_{\text{c}}), 10)\).
-
-For each sequence, the p-value for the respective test was used to
-reject the null hypothesis that sequences were derived from a population
-with a (central) normal distribution. The null hypothesis was rejected
-if \(p \leq 0.05\). The type I error rate was determined by computing
-the fraction of sequences rejected this way. Figure
+Each feature was created as follows: first \(n\) values are sampled from
+\(\mathcal{N}(0, 1)\). Then, the elements outside the central portion of
+the feature (i.e.~\(\mathbf{X} \setminus \mathbf{X}_{\text{c}}\)) are
+replaced. Elements with \(p_i < \frac{1-\kappa}{2}\) are replaced with
+values sampled from \(\mathcal{U}(-10,\min(\mathbf{X}_{\text{c}}))\),
+and elements with \(p_i > \frac{1 + \kappa}{2}\) are replaced with
+values sampled from \(\mathcal{U}(\max(\mathbf{X}_{\text{c}}), 10)\).
+
+For each feature, the p-value for the respective test was used to reject
+the null hypothesis that features were derived from a population with a
+(central) normal distribution. The null hypothesis was rejected if
+\(p \leq 0.05\). The type I error rate was determined by computing the
+fraction of features rejected this way. Figure
 \ref{fig:empirical-central-normality-error_rate} shows that when the
 central portion \(\kappa\) of the central normal distribution is equal to
 or greater than the \(\kappa\) parameter for the empirical central
@@ -808,19 +833,18 @@ \subsection{Robust transformations}\label{robust-transformations}
 normal distribution.
 
 To determine the weighting function parameters for each of the nine
-combinations, \(m_d=500\) sequences \(\mathbf{X}_i\)
+combinations, \(m_d=500\) numeric features \(\mathbf{X}_i\)
 (\(i \in \{1, 2, \ldots, m_d\}\)) were randomly drawn from randomly
 parametrised asymmetric generalised normal distributions. Each
 distribution was parametrised with a random skewness parameter
 \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter
 \(\beta \sim U\left(1.00, 5.00 \right)\). Location and scale parameters
 were set as \(\mu = 0\) and \(\sigma = 1\), respectively. To form each
-sequence \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) instances
-were then randomly drawn, with
-\(\gamma \sim U\left(\log_{10}50, 3\right)\), resulting in a set of
-sequences with between \(50\) and \(1000\) elements each. Outlier values
-were then drawn to randomly replace 10 percent of the elements of
-\(\mathbf{X}_i\). These values were set according to
+feature \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) instances were
+then randomly drawn, with \(\gamma \sim U\left(\log_{10}50, 3\right)\),
+resulting in a set of features with between \(50\) and \(1000\) elements
+each. Outlier values were then drawn to randomly replace 10 percent of
+the elements of \(\mathbf{X}_i\). These values were set according to
 \citet{Tukey1977-xm}, as follows. Let
 \(x^{*} \sim U\left(-2, 2\right)\). Then the corresponding outlier value
 was:
@@ -843,16 +867,16 @@ \subsection{Robust transformations}\label{robust-transformations}
 \(L = \sum_{i=1}^{m_d} L_{\text{cn},i} + 0.1 \sum_{i=1}^{m_d} L_{\lambda,i}\).
The composite loss consisted of two components: a loss term \(L_{\text{cn},i} = \tau_{n_i, \kappa = 0.80,i}\), i.e.~the mean of -residual errors of the central 80\% of elements of each sequence +residual errors of the central 80\% of elements of each feature \(\mathbf{X}_i\), which aimed at optimising central normality; and a loss term \(L_{\lambda,i} = \max\left(0.0,|\lambda_{0,i} - \lambda_{i}| - \xi \right)\), with \(\lambda_{0,i}\) and \(\lambda_{i}\) transformation parameters -found for sequence \(\mathbf{X}_i\) prior to and after adding outliers, +found for feature \(\mathbf{X}_i\) prior to and after adding outliers, respectively, and tolerance parameter \(\xi = 0.5\) for Box-Cox and \(\xi = 0.3\) for Yeo-Johnson power transformations. This second term aimed to prevent solutions that provide small improvements in central -normality at the cost of a poor fit of the tails of sequences. +normality at the cost of a poor fit of the tails of features. Minimisation was conducted using the BOBYQA algorithm for derivative-free bound constraint optimisation \citep{Powell2009-zb}. The @@ -934,40 +958,47 @@ \subsection{Robust transformations}\label{robust-transformations} \subsection{Assessing transformations using simulated data}\label{assessing-transformations-using-simulated-data} -Three datasets with 100000 sequences each were created to assess -transformation to normality. For each sequence \(\mathbf{X}_i\), +Three datasets with 100000 numeric features each were created to assess +transformation to normality. For each feature \(\mathbf{X}_i\), \(n = \lceil 10^\gamma \rceil\) elements were randomly drawn, with -\(\gamma \sim U\left(1, 4\right)\), resulting in sequences with between +\(\gamma \sim U\left(1, 4\right)\), resulting in features with between \(10\) and \(10000\) elements. \begin{itemize} \item - A \textit{clean} dataset with each sequence drawn from + A \textit{clean} dataset with each feature drawn from \(\mathcal{N}(0, 1)\) and transformed using an inverse power transformation with randomly drawn transformation parameter \(\lambda \sim U\left(0.00, 2.00 \right)\). \(\left(\phi_{\text{BC}}^\lambda\right)^{-1}\) and \(\left(\phi_{\text{YJ}}^\lambda\right)^{-1}\) were used as inverse transformations for assessing Box-Cox and Yeo-Johnson transformations, - respectively. Prior to inverse transformation, sequences for Box-Cox + respectively. Prior to inverse transformation, features for Box-Cox transformations were shifted into the positive domain, with minimum value \(1\). \item - A \textit{dirty} dataset with each sequence drawn from + A \textit{dirty} dataset with each feature drawn from \(\mathcal{AGN}\left(\mu = 0, \sigma = 1/\sqrt{2}, \alpha, \beta \right)\), with randomly drawn skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter - \(\beta \sim U\left(1.00, 5.00 \right)\). + \(\beta \sim U\left(1.00, 5.00 \right)\). The features are drawn from + a range of asymmetric unimodal distributions to reflect a wider range + of generating distributions than for the data presented in the first + dataset. \item - A \textit{shifted} dataset with each sequence drawn from - \(\mathcal{AGN}\left(\mu = 100, \sigma = 10^{-3} \cdot 1/\sqrt{2} , \alpha, \beta \right)\), + A \textit{shifted} dataset with each feature drawn from + \(\mathcal{AGN}\left(\mu = 1000, \sigma = 1/\sqrt{2} , \alpha, \beta \right)\), with randomly drawn skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape parameter - \(\beta \sim U\left(1.00, 5.00 \right)\). 
+ \(\beta \sim U\left(1.00, 5.00 \right)\). The generating process
+  creates features similar to those in the \textit{dirty} dataset, but
+  shifted to a location that leads to normalisation issues observed for
+  conventional power transformation methods, as shown in Figure
+  \ref{fig:decreased-normality}.
\end{itemize}
 
 A dataset with outliers was created for each of the above datasets by
-replacing 10 percent of elements in each sequence, as described earlier.
+replacing 10 percent of elements in each feature, as described earlier.
 
 In addition to no power transformation and location- and
 scale-invariant power transformations, conventional and Raymaekers and
 Rousseeuw's
@@ -980,8 +1011,7 @@ \subsection{Assessing transformations using simulated
 \item
   z-standardisation:
   \(x^{\prime}_{i} = \left(x_i - \mu \right) / \sigma\), with \(\mu\)
-  and \(\sigma\) the mean and standard deviation of sequence
-  \(\mathbf{X}\).
+  and \(\sigma\) the mean and standard deviation of \(\mathbf{X}\).
 \item
   robust scaling:
   \(x^{\prime}_{i} = \left(x_i - \text{median}\left(\mathbf{X}\right) \right) / Q_n\left(\mathbf{X}\right)\),
@@ -990,15 +1020,15 @@ \subsection{Assessing transformations using simulated
 
 This results in nine types of power transformation. Transformation
-parameters were optimised for each sequence. Subsequently the sum of
+parameters were optimised for each feature. Subsequently, the sum of
 residual errors and sum of residual errors of the central portion
-(\(\kappa = 0.80\)) were computed for each sequence after
-transformation. Then, each method was ranked according to the sum of
-residual errors (data without outliers) or the sum of residual errors of
-the central portion (data with outliers). The average rank of each
-transformation method was computed over all 100000 sequences in each
-dataset. Average ranks for Yeo-Johnson transformations are shown in
-Table \ref{tab:comparison_methods_simulation_yeo_johnson}. Location and
+(\(\kappa = 0.80\)) were computed for each feature after transformation.
+Then, each method was ranked according to the sum of residual errors
+(data without outliers) or the sum of residual errors of the central
+portion (data with outliers). The average rank of each transformation
+method was computed over all 100000 features in each dataset. Average
+ranks for Yeo-Johnson transformations are shown in Table
+\ref{tab:comparison_methods_simulations_yeo_johnson}. Location- and
 scale-invariant Yeo-Johnson transformation ranked best for
 all datasets without outliers. The robust variant ranked best in dirty and
 shifted datasets with outliers, but not in the clean dataset. Results for
@@ -1009,9 +1039,9 @@ \subsection{Assessing transformations using simulated
 \begin{center}
 \caption{
 Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion
-(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation
-of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0.
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. 
The clean dataset consists of features derived through inverse Yeo-Johnson transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
 A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold.
 }
@@ -1024,15 +1054,15 @@ \subsection{Assessing transformations using simulated
 \midrule
-none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.46 & 6.62 \\
-conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.51 & 5.95 \\
-conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.48 & 4.93 \\
-conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.41 & 5.59 \\
-Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.28 & 5.67 \\
-Raymaekers-Rousseeuw (z-score norm.) & & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.29 & 3.50 \\
-Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.27 & 3.57 \\
-invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.63} & 6.03 \\
-robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.65 & \textbf{3.14} \\
+none & & 8.52 & 7.62 & 7.35 & 6.17 & 7.45 & 6.61 \\
+conventional & & 3.82 & 5.78 & 3.86 & 7.08 & 6.34 & 6.20 \\
+conventional (z-score norm.) & & 4.70 & 5.75 & 6.44 & 5.07 & 5.50 & 4.99 \\
+conventional (robust scaling) & & 3.83 & 6.61 & 5.36 & 5.72 & 4.43 & 5.67 \\
+Raymaekers-Rousseeuw & & 4.64 & 3.22 & 4.01 & 4.11 & 6.36 & 5.22 \\
+Raymaekers-Rousseeuw (z-score norm.) & & 5.24 & \textbf{2.66} & 6.25 & 3.62 & 5.31 & 3.53 \\
+Raymaekers-Rousseeuw (robust scaling) & & 4.70 & 2.99 & 5.22 & 3.69 & 4.29 & 3.59 \\
+invariant & & \textbf{3.11} & 6.74 & \textbf{3.24} & 6.13 & \textbf{2.64} & 6.05 \\
+robust invariant & & 6.44 & 3.64 & 3.27 & \textbf{3.40} & 2.66 & \textbf{3.14} \\
 \bottomrule
 \end{tabular}
@@ -1383,7 +1413,7 @@ \section{Discussion}\label{discussion}
 not suffer from this issue, as weights remained fixed during MLE.
 
 We introduced an empirical test for central normality to assess whether
-sequences deviate from normality in a way that might require closer
+features deviate from normality in a way that might require closer
 inspection prior to further processing. The empirical central normality
 test differs from other tests for normality, such as the Shapiro-Wilk
 test \citep{Shapiro1965-zd}, as it assesses normality of the central
@@ -1394,11 +1424,16 @@ \section{Discussion}\label{discussion}
 enables assessing reasonable normality of (transformed) features in the
 presence of outliers.
 
-This work has the following limitation: We observed several numerical
+This work has the following limitations: We observed several numerical
 stability issues for optimisation criteria other than MLE (Appendix B).
 These appear in regions where transformation parameters would lead to
 very large or small numbers when using conventional power
-transformations. For MLE stability issues were not observed.
+transformations. For MLE, stability issues were not observed. Moreover,
+the definition of a central normal distribution does not constrain the
+distribution of the non-central tails. 
Thus, the test may be +conservative for central normal distributions other than uncontaminated +normal distributions, because the location and scale estimators are not +generally bias-free \citep{Rousseeuw1994-ur}. \section{Conclusion}\label{conclusion} diff --git a/manuscript_rmarkdown/manuscript_appendix.Rmd b/manuscript_rmarkdown/manuscript_appendix.Rmd index 2b74e4a..d6031ea 100644 --- a/manuscript_rmarkdown/manuscript_appendix.Rmd +++ b/manuscript_rmarkdown/manuscript_appendix.Rmd @@ -587,10 +587,10 @@ data.table::dcast(transformation ~ dataset + outlier, data = table_data, value.v \begin{center} \caption{ Comparison of average rank between Box-Cox transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Box-Cox transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. -If necessary, each sequence was shifted so that every element had a strictly positive value. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. The clean dataset consists of features derived through inverse Box-Cox transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. +If necessary, each feature was shifted so that every element had a strictly positive value. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -603,15 +603,15 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule -none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.47 & 6.78 \\ -conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.10 \\ -conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.34 & 5.84 \\ -conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.35 & 5.84 \\ -Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.30 & 5.80 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 6.23 & 3.80 & 4.93 & 3.85 & 4.48 & 3.50 \\ -Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.53 & 3.51 \\ -invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.51} & 5.58 \\ -robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & \textbf{3.51} & \textbf{2.00} \\ +none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.46 & 6.77 \\ +conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.26 \\ +conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.39 & 5.96 \\ +conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.40 & 5.96 \\ +Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.29 & 5.35 \\ +Raymaekers-Rousseeuw (z-score norm.) 
& & 6.23 & 3.80 & 4.93 & 3.85 & 4.53 & 3.51 \\ +Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.58 & 3.52 \\ +invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.53} & 5.67 \\ +robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & 3.55 & \textbf{2.00} \\ \bottomrule \end{tabular} @@ -624,22 +624,22 @@ robust invariant & & 6.10 & \textbf{2.76} & ## Ranking Yeo-Johnson transformations: residual errors In the main manuscript, transformation methods are compared by ranking within -each experiment, defined by the combination of sequence and absence/presence of +each experiment, defined by the combination of feature and absence/presence of outliers. This allows for direct comparison of methods between different experiments. Table \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual} shows the average normalised residual error for these transformation methods. This metric is computed from the residual error and the residual error of the central -portion ($\kappa = 0.80$) for sequences without and with outliers, respectively. +portion ($\kappa = 0.80$) for features without and with outliers, respectively. In each experiment, the respective residual error was normalised by dividing with -the sequence length, and can be interpreted as the average residual error of a -single sample. This reduces the effect of sequence length on the residual error. +the feature length, and can be interpreted as the average residual error of a +single sample. This reduces the effect of feature length on the residual error. Table \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual_diff} shows the average normalised difference in residual error between the best method for each experiment and the individual methods. As above, normalisation was done by -division by the length of the sequence corresponding to each experiment. +division by the length of the feature corresponding to each experiment. Both tables show that the differences between best-performing methods on each dataset are relatively small. @@ -692,10 +692,10 @@ data.table::dcast(transformation ~ dataset + outlier, data = table_data, value.v \begin{center} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. 
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. @@ -710,10 +710,10 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule none & & 0.202 & 0.112 & 0.128 & 0.080 & 0.129 & 0.080 \\ -conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.129 & 0.080 \\ +conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.126 & 0.080 \\ conventional (z-score norm.) & & 0.061 & 0.094 & 0.091 & 0.073 & 0.091 & 0.073 \\ conventional (robust scaling) & & 0.058 & 0.094 & 0.090 & 0.073 & 0.090 & 0.073 \\ -Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.129 & 0.080 \\ +Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.126 & 0.079 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.087 & 0.057 & 0.097 & 0.060 & 0.097 & 0.060 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.071 & 0.056 & 0.095 & 0.059 & 0.095 & 0.059 \\ invariant & & 0.057 & 0.099 & 0.083 & 0.075 & 0.083 & 0.075 \\ @@ -731,16 +731,16 @@ robust invariant & & 0.069 & 0.066 & 0.084 & 0.058 & 0.084 \caption{ Comparison of average difference in residual error between Yeo-Johnson transformation methods and the best transformation method within each -experiment, normalised by the corresponding sequence length. This is based on +experiment, normalised by the corresponding feature length. This is based on either the residual error (without outliers) or the residual error of the central portion (with outliers; $\kappa = 0.80$) over 3 datasets with 100000 -sequences each. Residual errors were normalised by sequence length to reduce the -effect of sequence length on the residual error. The clean dataset consists of -sequences derived through inverse Yeo-Johnson transformation of data sampled -from a standard normal distribution. The dirty dataset contains sequences +features each. Residual errors were normalised by feature length to reduce the +effect of feature length on the residual error. The clean dataset consists of +features derived through inverse Yeo-Johnson transformation of data sampled +from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. The -shifted dataset also contains sequences sampled from asymmetric generalised -normal distributions, but centred at 100, and scaled by 0.001. Several +shifted dataset also contains features sampled from asymmetric generalised +normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower difference is better, with a minimum value of 0.0. Discrepancies with the rank-based method may occur, @@ -757,10 +757,10 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule none & & 0.150 & 0.068 & 0.050 & 0.033 & 0.049 & 0.033 \\ -conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.049 & 0.033 \\ +conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.046 & 0.032 \\ conventional (z-score norm.) 
& & 0.008 & 0.049 & 0.013 & 0.026 & 0.011 & 0.025 \\ conventional (robust scaling) & & 0.005 & 0.050 & 0.012 & 0.026 & 0.010 & 0.026 \\ -Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.049 & 0.033 \\ +Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.046 & 0.031 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.034 & 0.012 & 0.018 & 0.014 & 0.017 & 0.013 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.018 & 0.011 & 0.017 & 0.012 & 0.015 & 0.011 \\ invariant & & 0.004 & 0.054 & 0.005 & 0.029 & 0.003 & 0.028 \\ @@ -773,21 +773,21 @@ robust invariant & & 0.016 & 0.021 & 0.006 & 0.011 & 0.004 \FloatBarrier -## Ranking Yeo-Johnson transformations: effect of sequence length +## Ranking Yeo-Johnson transformations: effect of feature length The main manuscript shows the overall ranking of different Yeo-Johnson-based power transformation methods across three sets of problems. -To assess the effect of limited sequence lengths, we perform ranking on subsets of the experiment. -The results for ranking the subset of sequences with 30 samples or fewer ($n = 16001$) +To assess the effect of limited feature lengths, we perform ranking on subsets of the experiment. +The results for ranking the subset of features with 30 samples or fewer ($n = 16001$) is shown in Table \ref{tab:comparison_methods_simulations_yeo_johnson_10_30}. -In the clean dataset without outliers, where all sequences were sampled from a normal distribution and +In the clean dataset without outliers, where all features were sampled from a normal distribution and then underwent an inverse Yeo-Johnson transformation, conventional and location- and scale-invariant are ranked similarly, with no clear advantage to any method. For the remaining datasets, the robust location- and scale-invariant method ranked best. It slightly outperformed (robustly scaled) conventional Yeo-Johnson transformations and (z-score normalised) robust transformations [@Raymaekers2024-zf], in datasets without and with outliers, respectively. -In the subset for larger sequence lengths, with at least 1000 samples per sequence ($n = 33037$), +In the subset for larger feature lengths, with at least 1000 samples per feature ($n = 33037$), the cost of making the location- and scale-invariant transformation robust against outliers becomes more apparent (Table \ref{tab:comparison_methods_simulations_yeo_johnson_1000_10000}). In the clean dataset without outliers, down-weighting of the tails of the distribution by @@ -839,10 +839,10 @@ data.table::dcast(transformation ~ dataset + outlier, data = table_data, value.v \begin{center} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Each sequence contained 30 samples or fewer. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Each feature contained 30 samples or fewer. 
+The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -855,15 +855,15 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule -none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.45 & 6.60 \\ -conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.55 & 5.94 \\ -conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.35 & 4.70 \\ -conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.76 & 5.20 \\ -Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.32 & 5.83 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 5.56 & 4.08 & 5.67 & 4.05 & 5.03 & 3.81 \\ -Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.55 & 3.94 \\ -invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.62 & 5.37 \\ -robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.61} \\ +none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.44 & 6.59 \\ +conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.35 & 6.00 \\ +conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.36 & 4.71 \\ +conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.77 & 5.21 \\ +Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.47 & 5.72 \\ +Raymaekers-Rousseeuw (z-score norm.) & & 5.56 & 4.08 & 5.67 & 4.05 & 5.04 & 3.82 \\ +Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.56 & 3.95 \\ +invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.64 & 5.38 \\ +robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.62} \\ \bottomrule \end{tabular} @@ -876,10 +876,10 @@ robust invariant & & 4.28 & \textbf{3.65} & \textb \begin{center} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -Each sequence contained 30 samples or fewer. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +Each feature contained 30 samples or fewer. The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. 
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. @@ -894,10 +894,10 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule none & & 0.246 & 0.173 & 0.186 & 0.156 & 0.187 & 0.157 \\ -conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.187 & 0.157 \\ +conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.184 & 0.155 \\ conventional (z-score norm.) & & 0.139 & 0.142 & 0.142 & 0.130 & 0.142 & 0.130 \\ conventional (robust scaling) & & 0.139 & 0.142 & 0.141 & 0.130 & 0.141 & 0.130 \\ -Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.187 & 0.157 \\ +Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.185 & 0.155 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.295 & 0.161 & 0.175 & 0.156 & 0.175 & 0.156 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.209 & 0.155 & 0.172 & 0.144 & 0.172 & 0.144 \\ invariant & & 0.141 & 0.144 & 0.140 & 0.132 & 0.140 & 0.132 \\ @@ -953,10 +953,10 @@ data.table::dcast(transformation ~ dataset + outlier, data = table_data, value.v \begin{center} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 sequences each. Each sequence contained between 1000 and 10000 samples. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 features each. Each feature contained between 1000 and 10000 samples. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -969,15 +969,15 @@ transformation & outliers: & no & yes & no & yes & no & yes \\ \midrule -none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.41 & 6.87 \\ -conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.42 & 6.05 \\ -conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.26 & 4.79 \\ -conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.87 & 5.65 \\ -Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.19 & 5.79 \\ -Raymaekers-Rousseeuw (z-score norm.) 
& & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.49 & 3.30 \\
-Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.12 & 3.23 \\
-invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.98} & 6.81 \\
-robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.26 & \textbf{2.51} \\
+none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.40 & 6.86 \\
+conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.24 & 6.45 \\
+conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.29 & 4.92 \\
+conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.90 & 5.79 \\
+Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.22 & 5.04 \\
+Raymaekers-Rousseeuw (z-score norm.) & & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.52 & 3.35 \\
+Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.16 & 3.27 \\
+invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.99} & 6.82 \\
+robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.27 & \textbf{2.52} \\

 \bottomrule
 \end{tabular}
@@ -989,10 +989,10 @@
 \begin{center}
 \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion
-(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error.
-Each sequence contained between 1000 and 10000 samples. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation
-of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0.
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error.
+Each feature contained between 1000 and 10000 samples. The clean dataset consists of features derived through inverse Yeo-Johnson transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
 A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors
 are not fully comparable between experiments, whereas the ranks achieved within each experiment are.
 }
 \label{tab:comparison_methods_simulations_yeo_johnson_1000_10000_avg_residual}
@@ -1007,10 +1007,10 @@ transformation & outliers: & no & yes & no & yes & no & yes \\

 \midrule
 none & & 0.186 & 0.091 & 0.106 & 0.053 & 0.106 & 0.053 \\
-conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.106 & 0.053 \\
+conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.103 & 0.053 \\
 conventional (z-score norm.) 
& & 0.021 & 0.076 & 0.070 & 0.050 & 0.070 & 0.050 \\ conventional (robust scaling) & & 0.014 & 0.076 & 0.069 & 0.050 & 0.069 & 0.050 \\ -Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.106 & 0.053 \\ +Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.103 & 0.051 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.021 & 0.020 & 0.070 & 0.030 & 0.070 & 0.030 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.015 & 0.020 & 0.068 & 0.030 & 0.068 & 0.030 \\ invariant & & 0.013 & 0.084 & 0.059 & 0.055 & 0.059 & 0.055 \\ @@ -1026,15 +1026,15 @@ robust invariant & & 0.034 & 0.042 & 0.061 & 0.032 & 0.061 ## Examples using clean data Robust transformations are hypothesised to have a cost in efficiency for data -without outliers, i.e. clean data. Here we draw nine sequences with elements randomly drawn from a standard normal +without outliers, i.e. clean data. Here we draw nine features with elements randomly drawn from a standard normal distribution $\mathcal{N}(0,1)$. Subsequently, we perform an inverse transformation $\left(\phi^{\lambda, 0, 1}\right)^{-1}$, with $\lambda \in \left(0, 2\right)$. -The sequences drawn resulted from the permutations of the number of elements of each -sequence ($n \in \{30, 100, 500\}$) and transformation parameter for the -inverse transformation $\lambda \in \{0.1, 1.0, 1.9\}$. Each sequence then underwent +The features drawn resulted from the permutations of the number of elements of each +feature ($n \in \{30, 100, 500\}$) and transformation parameter for the +inverse transformation $\lambda \in \{0.1, 1.0, 1.9\}$. Each feature then underwent power transformation using Box-Cox and Yeo-Johnson transformations. For Box-Cox -transformations, each sequence was shifted prior to inverse transformation +transformations, each feature was shifted prior to inverse transformation ensuring all elements were strictly positive after inverse transformation. 
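
To make this set-up concrete, the nine clean features can be generated in a few lines of R. This is a minimal sketch: `inv_yeo_johnson()` is a hand-rolled helper implementing the inverse of $\phi^{\lambda, 0, 1}$, not a function exported by the power.transform package.

```r
# Inverse Yeo-Johnson transformation with location 0 and scale 1, i.e. the
# inverse of phi^{lambda, 0, 1} (illustrative helper, not package API).
inv_yeo_johnson <- function(z, lambda) {
  x <- numeric(length(z))
  pos <- z >= 0.0

  # Branch for z >= 0: invert ((1 + x)^lambda - 1) / lambda.
  if (abs(lambda) > .Machine$double.eps) {
    x[pos] <- (lambda * z[pos] + 1.0)^(1.0 / lambda) - 1.0
  } else {
    x[pos] <- exp(z[pos]) - 1.0
  }

  # Branch for z < 0: invert -((1 - x)^(2 - lambda) - 1) / (2 - lambda).
  if (abs(lambda - 2.0) > .Machine$double.eps) {
    x[!pos] <- 1.0 - (1.0 - (2.0 - lambda) * z[!pos])^(1.0 / (2.0 - lambda))
  } else {
    x[!pos] <- 1.0 - exp(-z[!pos])
  }

  return(x)
}

# Nine clean features: all nine combinations of n in {30, 100, 500} and
# lambda in {0.1, 1.0, 1.9}, drawn from N(0, 1) and inverse-transformed.
settings <- expand.grid(n = c(30L, 100L, 500L), lambda = c(0.1, 1.0, 1.9))
features <- Map(
  function(n, lambda) inv_yeo_johnson(stats::rnorm(n), lambda),
  settings$n,
  settings$lambda
)
```

Each element of `features` can then be supplied as `x` to `power.transform::find_transformation_parameters()` to compare the conventional, robust, and invariant estimators on clean data, mirroring the comparison described above.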
Results for Box-Cox transformations are shown in Table diff --git a/manuscript_rmarkdown/manuscript_appendix.tex b/manuscript_rmarkdown/manuscript_appendix.tex index ee86c20..ef724eb 100644 --- a/manuscript_rmarkdown/manuscript_appendix.tex +++ b/manuscript_rmarkdown/manuscript_appendix.tex @@ -78,7 +78,7 @@ \title{Appendix} \author{Alex Zwanenburg, Steffen Löck} -\date{2025-10-29} +\date{2025-12-19} \begin{document} \maketitle @@ -94,7 +94,7 @@ \section{Appendix A: Log-likelihood functions for location- and transformation is: \begin{equation} -\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) = +\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) = \begin{cases} \left( \left(\frac{x_i - x_0}{s} \right)^\lambda - 1 \right) / \lambda & \text{if } \lambda \neq 0\\ \log\left[\frac{x_i - x_0}{s}\right] & \text{if } \lambda = 0 @@ -105,7 +105,7 @@ \section{Appendix A: Log-likelihood functions for location- and transformation is: \begin{equation} -\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) = +\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) = \begin{cases} \left( \left( 1 + \frac{x_i - x_0}{s}\right)^\lambda - 1\right) / \lambda & \text{if } \lambda \neq 0 \text{ and } x_i - x_0 \geq 0\\ \log\left[1 + \frac{x_i - x_0}{s}\right] & \text{if } \lambda = 0 \text{ and } x_i - x_0 \geq 0\\ @@ -537,7 +537,7 @@ \subsection{Simulations with other optimisation \begin{figure} -{\centering \includegraphics{manuscript_appendix_files/figure-latex/shifted-distributions-appendix-1} +{\centering \includegraphics{manuscript_appendix_files/figure-latex/shifted-distributions-appendix-1} } @@ -545,7 +545,7 @@ \subsection{Simulations with other optimisation \end{figure} \section{Appendix C: Empirical central normality test critical -values}\label{appendix-c} +values}\label{appendix-c-empirical-central-normality-test-critical-values} Critical values for the empirical central normality test at \(\kappa = 0.80\) are found in Table @@ -554,7 +554,7 @@ \section{Appendix C: Empirical central normality test critical \begin{table} \begin{center} \caption{Critical values of test statistic $\tau_{\alpha, n, \kappa = 0.80}$ for -central normality at $\kappa = 0.80$, as a function of significance +central normality at $\kappa = 0.80$, as a function of significance level $\alpha$ and number of instances $n$. The values in this table should be divided by $100$ before use.} \label{tab:empirical-central-normality-critical-statistic} \begin{tabular}{l | r r r r r r r r r} @@ -571,7 +571,7 @@ \section{Appendix C: Empirical central normality test critical 200 & 8.65 & 7.05 & 6.37 & 5.80 & 5.22 & 4.57 & 3.57 & 2.84 & 2.54 \\ 500 & 5.54 & 4.47 & 4.03 & 3.69 & 3.30 & 2.90 & 2.27 & 1.81 & 1.63 \\ 1000 & 3.88 & 3.17 & 2.86 & 2.60 & 2.34 & 2.05 & 1.61 & 1.28 & 1.15 \\ - 2000 & 2.79 & 2.24 & 2.03 & 1.84 & 1.66 & 1.45 & 1.14 & 0.91 & 0.82 \\ + 2000 & 2.79 & 2.24 & 2.03 & 1.84 & 1.66 & 1.45 & 1.14 & 0.91 & 0.82 \\ 5000 & 1.75 & 1.41 & 1.27 & 1.17 & 1.05 & 0.92 & 0.72 & 0.58 & 0.52 \\ 10000 & 1.21 & 1.00 & 0.90 & 0.82 & 0.74 & 0.65 & 0.51 & 0.41 & 0.36 \\ \bottomrule @@ -595,10 +595,10 @@ \subsection{Ranking Box-Cox \begin{center} \caption{ Comparison of average rank between Box-Cox transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. The clean dataset consists of sequences derived through inverse Box-Cox transformation -of data sampled from a standard normal distribution. 
The dirty dataset contains sequences sampled from asymmetric generalised normal distributions. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. -If necessary, each sequence was shifted so that every element had a strictly positive value. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. The clean dataset consists of features derived through inverse Box-Cox transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. +If necessary, each feature was shifted so that every element had a strictly positive value. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -611,15 +611,15 @@ \subsection{Ranking Box-Cox \midrule -none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.47 & 6.78 \\ -conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.10 \\ -conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.34 & 5.84 \\ -conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.35 & 5.84 \\ -Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.30 & 5.80 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 6.23 & 3.80 & 4.93 & 3.85 & 4.48 & 3.50 \\ -Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.53 & 3.51 \\ -invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.51} & 5.58 \\ -robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & \textbf{3.51} & \textbf{2.00} \\ +none & & 7.63 & 6.95 & 7.29 & 6.59 & 7.46 & 6.77 \\ +conventional & & 3.11 & 5.98 & 5.41 & 6.18 & 6.51 & 6.26 \\ +conventional (z-score norm.) & & 4.15 & 6.27 & 4.83 & 6.32 & 4.39 & 5.96 \\ +conventional (robust scaling) & & 4.20 & 6.29 & 4.84 & 6.31 & 4.40 & 5.96 \\ +Raymaekers-Rousseeuw & & 4.50 & 3.56 & 5.39 & 3.69 & 6.29 & 5.35 \\ +Raymaekers-Rousseeuw (z-score norm.) & & 6.23 & 3.80 & 4.93 & 3.85 & 4.53 & 3.51 \\ +Raymaekers-Rousseeuw (robust scaling) & & 6.12 & 3.80 & 4.99 & 3.87 & 4.58 & 3.52 \\ +invariant & & \textbf{2.96} & 5.59 & \textbf{3.62} & 6.01 & \textbf{3.53} & 5.67 \\ +robust invariant & & 6.10 & \textbf{2.76} & 3.68 & \textbf{2.19} & 3.55 & \textbf{2.00} \\ \bottomrule \end{tabular} @@ -632,25 +632,24 @@ \subsection{Ranking Yeo-Johnson transformations: residual errors}\label{ranking-yeo-johnson-transformations-residual-errors} In the main manuscript, transformation methods are compared by ranking -within each experiment, defined by the combination of sequence and +within each experiment, defined by the combination of feature and absence/presence of outliers. This allows for direct comparison of methods between different experiments. Table \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual} shows the average normalised residual error for these transformation methods. This metric is computed from the residual error and the -residual error of the central portion (\(\kappa = 0.80\)) for sequences +residual error of the central portion (\(\kappa = 0.80\)) for features without and with outliers, respectively. 
In each experiment, the -respective residual error was normalised by dividing with the sequence +respective residual error was normalised by dividing with the feature length, and can be interpreted as the average residual error of a single -sample. This reduces the effect of sequence length on the residual -error. +sample. This reduces the effect of feature length on the residual error. Table \ref{tab:comparison_methods_simulations_yeo_johnson_avg_residual_diff} shows the average normalised difference in residual error between the best method for each experiment and the individual methods. As above, -normalisation was done by division by the length of the sequence +normalisation was done by division by the length of the feature corresponding to each experiment. Both tables show that the differences between best-performing methods on @@ -660,12 +659,12 @@ \subsection{Ranking Yeo-Johnson transformations: residual \begin{center} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 100000 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. -A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors +A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. } \label{tab:comparison_methods_simulations_yeo_johnson_avg_residual} @@ -678,10 +677,10 @@ \subsection{Ranking Yeo-Johnson transformations: residual \midrule none & & 0.202 & 0.112 & 0.128 & 0.080 & 0.129 & 0.080 \\ -conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.129 & 0.080 \\ +conventional & & 0.058 & 0.094 & 0.085 & 0.078 & 0.126 & 0.080 \\ conventional (z-score norm.) & & 0.061 & 0.094 & 0.091 & 0.073 & 0.091 & 0.073 \\ conventional (robust scaling) & & 0.058 & 0.094 & 0.090 & 0.073 & 0.090 & 0.073 \\ -Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.129 & 0.080 \\ +Raymaekers-Rousseeuw & & 0.071 & 0.055 & 0.092 & 0.279 & 0.126 & 0.079 \\ Raymaekers-Rousseeuw (z-score norm.) 
& & 0.087 & 0.057 & 0.097 & 0.060 & 0.097 & 0.060 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.071 & 0.056 & 0.095 & 0.059 & 0.095 & 0.059 \\ invariant & & 0.057 & 0.099 & 0.083 & 0.075 & 0.083 & 0.075 \\ @@ -697,16 +696,16 @@ \subsection{Ranking Yeo-Johnson transformations: residual \caption{ Comparison of average difference in residual error between Yeo-Johnson transformation methods and the best transformation method within each -experiment, normalised by the corresponding sequence length. This is based on +experiment, normalised by the corresponding feature length. This is based on either the residual error (without outliers) or the residual error of the central portion (with outliers; $\kappa = 0.80$) over 3 datasets with 100000 -sequences each. Residual errors were normalised by sequence length to reduce the -effect of sequence length on the residual error. The clean dataset consists of -sequences derived through inverse Yeo-Johnson transformation of data sampled -from a standard normal distribution. The dirty dataset contains sequences +features each. Residual errors were normalised by feature length to reduce the +effect of feature length on the residual error. The clean dataset consists of +features derived through inverse Yeo-Johnson transformation of data sampled +from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. The -shifted dataset also contains sequences sampled from asymmetric generalised -normal distributions, but centred at 100, and scaled by 0.001. Several +shifted dataset also contains features sampled from asymmetric generalised +normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A lower difference is better, with a minimum value of 0.0. Discrepancies with the rank-based method may occur, @@ -723,10 +722,10 @@ \subsection{Ranking Yeo-Johnson transformations: residual \midrule none & & 0.150 & 0.068 & 0.050 & 0.033 & 0.049 & 0.033 \\ -conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.049 & 0.033 \\ +conventional & & 0.005 & 0.049 & 0.006 & 0.032 & 0.046 & 0.032 \\ conventional (z-score norm.) & & 0.008 & 0.049 & 0.013 & 0.026 & 0.011 & 0.025 \\ conventional (robust scaling) & & 0.005 & 0.050 & 0.012 & 0.026 & 0.010 & 0.026 \\ -Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.049 & 0.033 \\ +Raymaekers-Rousseeuw & & 0.018 & 0.011 & 0.014 & 0.233 & 0.046 & 0.031 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.034 & 0.012 & 0.018 & 0.014 & 0.017 & 0.013 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.018 & 0.011 & 0.017 & 0.012 & 0.015 & 0.011 \\ invariant & & 0.004 & 0.054 & 0.005 & 0.029 & 0.003 & 0.028 \\ @@ -739,16 +738,16 @@ \subsection{Ranking Yeo-Johnson transformations: residual \FloatBarrier -\subsection{Ranking Yeo-Johnson transformations: effect of sequence -length}\label{ranking-yeo-johnson-transformations-effect-of-sequence-length} +\subsection{Ranking Yeo-Johnson transformations: effect of feature +length}\label{ranking-yeo-johnson-transformations-effect-of-feature-length} The main manuscript shows the overall ranking of different Yeo-Johnson-based power transformation methods across three sets of -problems. To assess the effect of limited sequence lengths, we perform +problems. To assess the effect of limited feature lengths, we perform ranking on subsets of the experiment. 
The results for ranking the subset -of sequences with 30 samples or fewer (\(n = 16001\)) is shown in Table +of features with 30 samples or fewer (\(n = 16001\)) is shown in Table \ref{tab:comparison_methods_simulations_yeo_johnson_10_30}. In the clean -dataset without outliers, where all sequences were sampled from a normal +dataset without outliers, where all features were sampled from a normal distribution and then underwent an inverse Yeo-Johnson transformation, conventional and location- and scale-invariant are ranked similarly, with no clear advantage to any method. For the remaining datasets, the @@ -758,8 +757,8 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \citep{Raymaekers2024-zf}, in datasets without and with outliers, respectively. -In the subset for larger sequence lengths, with at least 1000 samples -per sequence (\(n = 33037\)), the cost of making the location- and +In the subset for larger feature lengths, with at least 1000 samples per +feature (\(n = 33037\)), the cost of making the location- and scale-invariant transformation robust against outliers becomes more apparent (Table \ref{tab:comparison_methods_simulations_yeo_johnson_1000_10000}). In the @@ -772,10 +771,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{center} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Each sequence contained 30 samples or fewer. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Each feature contained 30 samples or fewer. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -788,15 +787,15 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule -none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.45 & 6.60 \\ -conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.55 & 5.94 \\ -conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.35 & 4.70 \\ -conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.76 & 5.20 \\ -Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.32 & 5.83 \\ -Raymaekers-Rousseeuw (z-score norm.) 
& & 5.56 & 4.08 & 5.67 & 4.05 & 5.03 & 3.81 \\ -Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.55 & 3.94 \\ -invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.62 & 5.37 \\ -robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.61} \\ +none & & 7.75 & 6.81 & 7.27 & 6.40 & 7.44 & 6.59 \\ +conventional & & 4.19 & 5.47 & 4.48 & 5.78 & 6.35 & 6.00 \\ +conventional (z-score norm.) & & 4.36 & 5.12 & 5.01 & 4.99 & 4.36 & 4.71 \\ +conventional (robust scaling) & & \textbf{4.05} & 5.63 & 4.38 & 5.49 & 3.77 & 5.21 \\ +Raymaekers-Rousseeuw & & 5.36 & 4.19 & 5.14 & 4.62 & 6.47 & 5.72 \\ +Raymaekers-Rousseeuw (z-score norm.) & & 5.56 & 4.08 & 5.67 & 4.05 & 5.04 & 3.82 \\ +Raymaekers-Rousseeuw (robust scaling) & & 5.33 & 4.14 & 5.15 & 4.15 & 4.56 & 3.95 \\ +invariant & & 4.11 & 5.92 & 4.12 & 5.67 & 3.64 & 5.38 \\ +robust invariant & & 4.28 & \textbf{3.65} & \textbf{3.79} & \textbf{3.85} & \textbf{3.37} & \textbf{3.62} \\ \bottomrule \end{tabular} @@ -807,12 +806,12 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{center} \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error. -Each sequence contained 30 samples or fewer. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error. +Each feature contained 30 samples or fewer. The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. -A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors +A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors are not fully comparable between experiments, whereas the ranks achieved within each experiment are. } \label{tab:comparison_methods_simulations_yeo_johnson_10_30_avg_residual} @@ -825,10 +824,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule none & & 0.246 & 0.173 & 0.186 & 0.156 & 0.187 & 0.157 \\ -conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.187 & 0.157 \\ +conventional & & 0.139 & 0.142 & 0.142 & 0.132 & 0.184 & 0.155 \\ conventional (z-score norm.) 
& & 0.139 & 0.142 & 0.142 & 0.130 & 0.142 & 0.130 \\ conventional (robust scaling) & & 0.139 & 0.142 & 0.141 & 0.130 & 0.141 & 0.130 \\ -Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.187 & 0.157 \\ +Raymaekers-Rousseeuw & & 0.211 & 0.151 & 0.183 & 1.515 & 0.185 & 0.155 \\ Raymaekers-Rousseeuw (z-score norm.) & & 0.295 & 0.161 & 0.175 & 0.156 & 0.175 & 0.156 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.209 & 0.155 & 0.172 & 0.144 & 0.172 & 0.144 \\ invariant & & 0.141 & 0.144 & 0.140 & 0.132 & 0.140 & 0.132 \\ @@ -843,10 +842,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \begin{center} \caption{ Comparison of average rank between Yeo-Johnson transformation methods based on either residual error (without outliers) or residual error of the central portion -(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 sequences each. Each sequence contained between 1000 and 10000 samples. -The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation -of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0. -The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001. +(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 features each. Each feature contained between 1000 and 10000 samples. +The clean dataset consists of features derived through inverse Yeo-Johnson transformation +of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0. +The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000. Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. A rank of 1 is the best and a rank of 9 the worst. For each dataset, the best ranking transformation is marked in bold. } @@ -859,15 +858,15 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \midrule -none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.41 & 6.87 \\ -conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.42 & 6.05 \\ -conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.26 & 4.79 \\ -conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.87 & 5.65 \\ -Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.19 & 5.79 \\ -Raymaekers-Rousseeuw (z-score norm.) & & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.49 & 3.30 \\ -Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.12 & 3.23 \\ -invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.98} & 6.81 \\ -robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.26 & \textbf{2.51} \\ +none & & 8.85 & 8.14 & 7.35 & 6.29 & 7.40 & 6.86 \\ +conventional & & 3.50 & 5.65 & 3.52 & 7.78 & 6.24 & 6.45 \\ +conventional (z-score norm.) & & 5.24 & 5.94 & 7.34 & 4.98 & 6.29 & 4.92 \\ +conventional (robust scaling) & & 3.53 & 6.98 & 5.97 & 5.79 & 4.90 & 5.79 \\ +Raymaekers-Rousseeuw & & 4.15 & 2.81 & 3.37 & 3.72 & 6.22 & 5.04 \\ +Raymaekers-Rousseeuw (z-score norm.) 
& & 5.34 & \textbf{1.86} & 6.57 & 3.39 & 5.52 & 3.35 \\
+Raymaekers-Rousseeuw (robust scaling) & & 4.20 & 2.33 & 5.21 & 3.36 & 4.16 & 3.27 \\
+invariant & & \textbf{2.36} & 7.59 & \textbf{2.66} & 6.82 & \textbf{1.99} & 6.82 \\
+robust invariant & & 7.83 & 3.70 & 3.01 & \textbf{2.88} & 2.27 & \textbf{2.52} \\

 \bottomrule
 \end{tabular}
@@ -878,12 +877,12 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence
 \begin{center}
 \caption{ Comparison of average normalised residual error between Yeo-Johnson transformation methods based on either the residual error (without outliers) or the residual error of the central portion
-(with outliers; $\kappa = 0.80$) over 3 datasets with 16001 sequences each. Residual errors were normalised by sequence length to reduce the effect of sequence length on the residual error.
-Each sequence contained between 1000 and 10000 samples. The clean dataset consists of sequences derived through inverse Yeo-Johnson transformation
-of data sampled from a standard normal distribution. The dirty dataset contains sequences sampled from asymmetric generalised normal distributions, centred at 0.
-The shifted dataset also contains sequences sampled from asymmetric generalised normal distributions, but centred at 100, and scaled by 0.001.
+(with outliers; $\kappa = 0.80$) over 3 datasets with 33037 features each. Residual errors were normalised by feature length to reduce the effect of feature length on the residual error.
+Each feature contained between 1000 and 10000 samples. The clean dataset consists of features derived through inverse Yeo-Johnson transformation
+of data sampled from a standard normal distribution. The dirty dataset contains features sampled from asymmetric generalised normal distributions, centred at 0.
+The shifted dataset also contains features sampled from asymmetric generalised normal distributions, but centred at 1000.
 Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling.
-A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors
+A lower normalised residual error is better. Discrepancies with the rank-based method may occur, as even with normalisation residual errors
 are not fully comparable between experiments, whereas the ranks achieved within each experiment are.
 }
 \label{tab:comparison_methods_simulations_yeo_johnson_1000_10000_avg_residual}
@@ -896,10 +895,10 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence

 \midrule
 none & & 0.186 & 0.091 & 0.106 & 0.053 & 0.106 & 0.053 \\
-conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.106 & 0.053 \\
+conventional & & 0.014 & 0.076 & 0.060 & 0.057 & 0.103 & 0.053 \\
 conventional (z-score norm.) & & 0.021 & 0.076 & 0.070 & 0.050 & 0.070 & 0.050 \\
 conventional (robust scaling) & & 0.014 & 0.076 & 0.069 & 0.050 & 0.069 & 0.050 \\
-Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.106 & 0.053 \\
+Raymaekers-Rousseeuw & & 0.015 & 0.021 & 0.060 & 0.030 & 0.103 & 0.051 \\
 Raymaekers-Rousseeuw (z-score norm.) 
& & 0.021 & 0.020 & 0.070 & 0.030 & 0.070 & 0.030 \\ Raymaekers-Rousseeuw (robust scaling) & & 0.015 & 0.020 & 0.068 & 0.030 & 0.068 & 0.030 \\ invariant & & 0.013 & 0.084 & 0.059 & 0.055 & 0.059 & 0.055 \\ @@ -915,18 +914,18 @@ \subsection{Ranking Yeo-Johnson transformations: effect of sequence \subsection{Examples using clean data}\label{examples-using-clean-data} Robust transformations are hypothesised to have a cost in efficiency for -data without outliers, i.e.~clean data. Here we draw nine sequences with +data without outliers, i.e.~clean data. Here we draw nine features with elements randomly drawn from a standard normal distribution \(\mathcal{N}(0,1)\). Subsequently, we perform an inverse transformation \(\left(\phi^{\lambda, 0, 1}\right)^{-1}\), with \(\lambda \in \left(0, 2\right)\). -The sequences drawn resulted from the permutations of the number of -elements of each sequence (\(n \in \{30, 100, 500\}\)) and -transformation parameter for the inverse transformation -\(\lambda \in \{0.1, 1.0, 1.9\}\). Each sequence then underwent power +The features drawn resulted from the permutations of the number of +elements of each feature (\(n \in \{30, 100, 500\}\)) and transformation +parameter for the inverse transformation +\(\lambda \in \{0.1, 1.0, 1.9\}\). Each feature then underwent power transformation using Box-Cox and Yeo-Johnson transformations. For -Box-Cox transformations, each sequence was shifted prior to inverse +Box-Cox transformations, each feature was shifted prior to inverse transformation ensuring all elements were strictly positive after inverse transformation. @@ -1142,8 +1141,8 @@ \subsection{Yeo-Johnson \begin{table} \begin{center} -\caption{Residual errors for features from real-world datasets after Yeo-Johnson transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. +\caption{Residual errors for features from real-world datasets after Yeo-Johnson transformation to normality. +Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-residuals} \begin{tabular}{l | r r r r r} @@ -1169,8 +1168,8 @@ \subsection{Yeo-Johnson \begin{table} \begin{center} -\caption{Transformation parameter $\lambda$ for features from real-world datasets after Yeo-Johnson transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. +\caption{Transformation parameter $\lambda$ for features from real-world datasets after Yeo-Johnson transformation to normality. +Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-lambda} \begin{tabular}{l | r r r r r} @@ -1196,7 +1195,7 @@ \subsection{Yeo-Johnson \begin{center} \caption{ p-values of empirical central normality tests ($\kappa = 0.80$) for features from real-world datasets after Yeo-Johnson transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. 
+Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-p-value} \begin{tabular}{l | r r r r r} @@ -1232,8 +1231,8 @@ \subsection{Box-Cox transformation}\label{box-cox-transformation} \begin{table} \begin{center} -\caption{Residual errors for features from real-world datasets after Box-Cox transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. +\caption{Residual errors for features from real-world datasets after Box-Cox transformation to normality. +Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-residuals-bc} \begin{tabular}{l | r r r r r} @@ -1246,7 +1245,7 @@ \subsection{Box-Cox transformation}\label{box-cox-transformation} conventional & 11.5 & 33.5 & 55.7 & 318.9 & 32.2 \\ conventional (z-score norm.) & 12.9 & 44.5 & 58.8 & 305.3 & 27.3 \\ conventional (robust scaling) & 12.9 & 44.7 & 59.0 & 305.3 & 27.3 \\ -Raymaekers-Rousseeuw & 11.5 & 127.1 & 47.6 & 314.8 & 32.2 \\ +Raymaekers-Rousseeuw & 11.5 & 127.1 & 47.6 & 314.8 & 32.2 \\ Raymaekers-Rousseeuw (z-score norm.) & 13.1 & 54.8 & 127.5 & 510.2 & 23.6 \\ Raymaekers-Rousseeuw (robust scaling) & 13.1 & 100.2 & 44.9 & 423.2 & 23.6 \\ invariant & 11.6 & 28.0 & 48.4 & 311.8 & 27.3 \\ @@ -1259,8 +1258,8 @@ \subsection{Box-Cox transformation}\label{box-cox-transformation} \begin{table} \begin{center} -\caption{Transformation parameter $\lambda$ for features from real-world datasets after Box-Cox transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. +\caption{Transformation parameter $\lambda$ for features from real-world datasets after Box-Cox transformation to normality. +Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-lambda-bc} \begin{tabular}{l | r r r r r} @@ -1288,7 +1287,7 @@ \subsection{Box-Cox transformation}\label{box-cox-transformation} \begin{center} \caption{ p-values of empirical central normality tests ($\kappa = 0.80$) for features from real-world datasets after Box-Cox transformation to normality. -Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. +Several transformation methods include normalisation before transformation, indicated by z-score normalisation (norm.) or robust scaling. 
AWT: arterial wall thickness; FE: fuel efficiency; PBM: penguin body mass} \label{tab:experimental-results-appendix-p-value-bc} \begin{tabular}{l | r r r r r} diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/decreased-normality-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/decreased-normality-1.pdf index 9f8fc85..dbe5259 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/decreased-normality-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/decreased-normality-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/empirical-central-normality-examples-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/empirical-central-normality-examples-1.pdf index b7a3acd..dec53ce 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/empirical-central-normality-examples-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/empirical-central-normality-examples-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-invariance-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-invariance-1.pdf index b74f0dd..1f03a38 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-invariance-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-invariance-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-outlier-robustness-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-outlier-robustness-1.pdf index 77660b9..20d5fd6 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-outlier-robustness-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/experimental-results-outlier-robustness-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/marginal-effect-plot-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/marginal-effect-plot-1.pdf index ccea072..7812a33 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/marginal-effect-plot-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/marginal-effect-plot-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/shifted-distributions-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/shifted-distributions-1.pdf index fb1a091..81097e0 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/shifted-distributions-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/shifted-distributions-1.pdf differ diff --git a/manuscript_rmarkdown/manuscript_files/figure-latex/weighting-functions-1.pdf b/manuscript_rmarkdown/manuscript_files/figure-latex/weighting-functions-1.pdf index 9dc1c3b..4309f27 100644 Binary files a/manuscript_rmarkdown/manuscript_files/figure-latex/weighting-functions-1.pdf and b/manuscript_rmarkdown/manuscript_files/figure-latex/weighting-functions-1.pdf differ diff --git a/manuscript_rmarkdown/refs.bib b/manuscript_rmarkdown/refs.bib index d1dc70a..d69530e 100644 --- a/manuscript_rmarkdown/refs.bib +++ b/manuscript_rmarkdown/refs.bib @@ -453,3 +453,46 @@ @ARTICLE{Croux1992-zz doi = "10.1080/03610929208830889", issn = "0361-0926,1532-415X" } + +@INCOLLECTION{van-der-Vaart1998-xu, + title = "Quantiles and Order Statistics", + author = "van der Vaart, A W", + booktitle = "Asymptotic Statistics", + publisher = "Cambridge University Press", + address = "Cambridge", + pages = 
"304--315", + month = oct, + year = 1998, + doi = "10.1017/cbo9780511802256.022", + isbn = "9780521496032,9780521784504" +} + + +@ARTICLE{Huber1964-xk, + title = "Robust estimation of a location parameter", + author = "Huber, Peter J", + journal = "Ann. Math. Stat.", + publisher = "Institute of Mathematical Statistics", + volume = 35, + number = 1, + pages = "73--101", + month = mar, + year = 1964, + doi = "10.1214/aoms/1177703732", + issn = "0003-4851,2168-8990" +} + + +@ARTICLE{Rousseeuw1994-ur, + title = "The bias of k-step {M}-estimators", + author = "Rousseeuw, Peter J and Croux, Christophe", + journal = "Stat. Probab. Lett.", + publisher = "Elsevier BV", + volume = 20, + number = 5, + pages = "411--420", + month = aug, + year = 1994, + doi = "10.1016/0167-7152(94)90133-3", + issn = "0167-7152,1879-2103" +} diff --git a/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox.RDS b/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox.RDS index 22fdc99..ba85d50 100644 Binary files a/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox.RDS and b/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox.RDS differ diff --git a/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox_R2.RDS b/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox_R2.RDS new file mode 100644 index 0000000..22fdc99 Binary files /dev/null and b/manuscript_rmarkdown/simulated_data_dirty_shifted_box_cox_R2.RDS differ diff --git a/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson.RDS b/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson.RDS index f2f174e..ec8e0ef 100644 Binary files a/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson.RDS and b/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson.RDS differ diff --git a/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson_R2.RDS b/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson_R2.RDS new file mode 100644 index 0000000..f2f174e Binary files /dev/null and b/manuscript_rmarkdown/simulated_data_dirty_shifted_yeo_johnson_R2.RDS differ diff --git a/manuscript_rmarkdown/simulations.R b/manuscript_rmarkdown/simulations.R index 1f57a20..0f94a82 100644 --- a/manuscript_rmarkdown/simulations.R +++ b/manuscript_rmarkdown/simulations.R @@ -1194,8 +1194,8 @@ # Dirty data that is shifted and scaled to be far from 0.0. x <- power.transform::ragn( n, - location = 100.0, - scale = 0.001 * 1 / sqrt(2), + location = 1000.0, + scale = 1 / sqrt(2), alpha = alpha, beta = beta ) @@ -1783,3 +1783,158 @@ value.var = "tau" )) } + + + +.assess_gc_theorem <- function(manuscript_dir) { + n_rep <- 100L # number of distributions drawn. + # n_rep <- 10L # number of distributions drawn. + kappa <- c(0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.00) # central portion of distribution. + + # From 5 to 10000000 in equal steps (log 10 scale) + n <- unique(floor(10^(seq(from = 0.75, to = 7.0, by = 0.25)))) + + # Wrapper for multi-processing. 
+  .compute_wrapper <- function(
+    n,
+    n_rep,
+    kappa
+  ) {
+    return(
+      lapply(
+        X = n,
+        FUN = ..assess_gc_theorem,
+        n_rep = n_rep,
+        kappa = kappa
+      )
+    )
+  }
+
+  file_name <- file.path(manuscript_dir, "gc_theorem_data.RDS")
+
+  if (!file.exists(file_name)) {
+    n_threads <- 18L # adjust to the local machine.
+    assignment_id <- rep_len(seq_len(n_threads), length(n))
+
+    cl <- parallel::makeCluster(n_threads)
+    on.exit(parallel::stopCluster(cl))
+
+    parallel::clusterExport(cl = cl, "..assess_gc_theorem")
+
+    # Serial fallback, kept for debugging without a cluster:
+    # data <- lapply(
+    #   X = split(n, assignment_id),
+    #   FUN = .compute_wrapper,
+    #   n_rep = n_rep,
+    #   kappa = kappa
+    # )
+
+    data <- parallel::parLapply(
+      cl = cl,
+      X = split(n, assignment_id),
+      fun = .compute_wrapper,
+      n_rep = n_rep,
+      kappa = kappa
+    )
+
+    # Aggregate to a single table.
+    data <- data.table::rbindlist(unlist(data, recursive = FALSE))
+    data <- data[order(n, kappa, sum_residual_error)]
+
+    # Store for reuse in later builds.
+    saveRDS(data, file_name)
+
+  } else {
+    data <- readRDS(file_name)
+  }
+
+  return(data)
+}
+
+
+..assess_gc_theorem <- function(n, n_rep, kappa) {
+  set.seed(19L + n)
+
+  data <- list()
+  result_index <- 1L
+
+  # Repeat n_rep times.
+  for (jj in seq_len(n_rep)) {
+    # Draw from a standard normal distribution -- only the central portion
+    # kappa is assessed.
+    x <- power.transform::ragn(
+      floor(n),
+      location = 0.0,
+      scale = 1.0 / sqrt(2.0),
+      alpha = 0.5,
+      beta = 2.0
+    )
+
+    # Sort in ascending order.
+    x <- sort(x)
+
+    # Compute the expected z-score.
+    z_expected <- power.transform:::compute_expected_z(x = x)
+
+    # Compute the summed absolute residual error for all kappa.
+    sum_residual_error <- numeric(length(kappa))
+    robust_mu <- numeric(length(kappa))
+    robust_sigma <- numeric(length(kappa))
+
+    for (kk in seq_along(kappa)) {
+      p_lower <- 0.50 - kappa[kk] / 2.0
+      p_upper <- 0.50 + kappa[kk] / 2.0
+
+      p_observed <- (seq_along(x) - 1.0 / 3.0) / (length(x) + 1.0 / 3.0)
+
+      # Use local copy of x.
+      x_c <- x
+
+      # Contaminate the tails: replace values outside [p_lower, p_upper] with noise.
+      lower_tail <- p_observed < p_lower
+      if (sum(lower_tail) > 0L) {
+        lower_range <- max(x[lower_tail])
+        x_c[lower_tail] <- sort(stats::runif(sum(lower_tail), min = lower_range - 20.0, max = lower_range))
+      }
+
+      upper_tail <- p_observed > p_upper
+      if (sum(upper_tail) > 0L) {
+        upper_range <- min(x[upper_tail])
+        x_c[upper_tail] <- sort(stats::runif(sum(upper_tail), min = upper_range, max = upper_range + 20.0))
+      }
+
+      # Compute Huber M-estimates for location and scale from the contaminated copy.
+      cutoff_k <- stats::qnorm(p_upper)
+      if (is.infinite(cutoff_k)) cutoff_k <- 9.0
+      robust_estimates <- power.transform::huber_estimate(x_c, k = cutoff_k, tol = 1E-3)
+      if (robust_estimates$sigma == 0.0) robust_estimates$sigma <- 1.0
+
+      # Compute the observed z-score.
+      z_observed <- (x_c - robust_estimates$mu) / robust_estimates$sigma
+
+      # Compute residuals.
+      residual_data <- data.table::data.table(
+        "z_expected" = z_expected,
+        "z_observed" = z_observed,
+        "residual" = z_observed - z_expected,
+        "p_observed" = p_observed
+      )
+
+      sum_residual_error[kk] <- sum(abs(residual_data[p_observed >= p_lower & p_observed <= p_upper]$residual))
+      robust_mu[kk] <- robust_estimates$mu
+      robust_sigma[kk] <- robust_estimates$sigma
+    }
+
+    # Set data.
+    data[[result_index]] <- list(
+      "n" = n,
+      "kappa" = kappa,
+      "sum_residual_error" = sum_residual_error,
+      "robust_mu" = robust_mu,
+      "robust_sigma" = robust_sigma
+    )
+    result_index <- result_index + 1L
+  }
+
+  data <- data.table::rbindlist(data)
+  return(data)
+}
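
As a usage note for the new helper: .assess_gc_theorem() caches its output in gc_theorem_data.RDS, so later manuscript builds load the stored table instead of recomputing it. A minimal invocation sketch follows; the directory name is an assumption that simply mirrors the paths elsewhere in this patch, and the column names are taken from the code above.

# Run (or load from cache) the assessment; "manuscript_rmarkdown" is assumed.
gc_data <- .assess_gc_theorem(manuscript_dir = "manuscript_rmarkdown")

# Example summary: median summed residual error per sample size and
# central portion kappa.
gc_data[, list(median_error = stats::median(sum_residual_error)), by = c("n", "kappa")]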
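A note on the plotting positions used in ..assess_gc_theorem(): (i - 1/3)/(n + 1/3) is a common choice that approximates the median probability associated with the i-th order statistic. compute_expected_z() is internal to power.transform and its exact convention is not shown in this patch; under the assumption that it maps such positions through the standard normal quantile function, the expected z-scores can be approximated directly:

# Sketch only: assumes power.transform:::compute_expected_z() maps
# median-unbiased plotting positions through qnorm(); this is an
# assumption, not a documented fact.
n <- 100L
p <- (seq_len(n) - 1.0 / 3.0) / (n + 1.0 / 3.0)
z_expected_approx <- stats::qnorm(p)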
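Finally, the clean-data experiment described in the manuscript hunk earlier in this patch (nine features from the combinations of n and lambda) can be sketched as follows. This is a minimal sketch, not the authors' exact script: only the Yeo-Johnson arm is shown, the inverse Yeo-Johnson transform is written out by hand because it is not clear from this patch whether power.transform exports an inverse, and default arguments of find_transformation_parameters beyond method are assumed.

# Closed-form inverse of the Yeo-Johnson transform (hand-rolled here;
# the package may provide its own inverse).
inverse_yeo_johnson <- function(y, lambda) {
  x <- numeric(length(y))
  non_negative <- y >= 0.0
  if (lambda != 0.0) {
    x[non_negative] <- (lambda * y[non_negative] + 1.0)^(1.0 / lambda) - 1.0
  } else {
    x[non_negative] <- exp(y[non_negative]) - 1.0
  }
  if (lambda != 2.0) {
    x[!non_negative] <- 1.0 - (1.0 - (2.0 - lambda) * y[!non_negative])^(1.0 / (2.0 - lambda))
  } else {
    x[!non_negative] <- 1.0 - exp(-y[!non_negative])
  }
  return(x)
}

set.seed(1L)
for (n in c(30L, 100L, 500L)) {
  for (lambda in c(0.1, 1.0, 1.9)) {
    # Clean feature: inverse-transform standard normal draws.
    x <- inverse_yeo_johnson(stats::rnorm(n), lambda)

    # The fitted lambda should lie close to the known value; remaining
    # arguments of find_transformation_parameters are left at their
    # (assumed) defaults.
    transformer <- power.transform::find_transformation_parameters(
      x = x,
      method = "yeo_johnson"
    )
  }
}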