diff --git a/DESCRIPTION b/DESCRIPTION index 2d27bf2f..eba0aaeb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,6 +47,7 @@ Suggests: rmarkdown, testthat (>= 3.0.0), tidyverse, + mvtnorm, withr VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 2b4a4878..fe6d1480 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,10 @@ export(SimPed) export(addPersonToPed) +export(buildFamilyGroups) +export(buildOneFamilyGroup) +export(buildPedigreeModelCovariance) +export(buildPedigreeMx) export(buildTreeGrid) export(calcAllGens) export(calcFamilySize) @@ -18,6 +22,7 @@ export(createGenDataFrame) export(determineSex) export(dropLink) export(fitComponentModel) +export(fitPedigreeModel) export(getWikiTreeSummary) export(identifyComponentModel) export(inferRelatedness) diff --git a/NEWS.md b/NEWS.md index 92292f4a..402a54f1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,15 @@ # BGmisc NEWS # Development version: 1.6.0.9000 +Add OpenMx pedigree model builders and docs +Added vignette for OpenMx pedigree model builders +Add option for MZ twins in the additive genetic matrix +Add option to select sex for MZ twin generation. +Add option to tweak pedigree with one id provided # BGmisc 1.6.0.1 * Add helper functions for checkParents etc * fixed incorrect direction so that parents are pointing to children in the graphs -* Optimize simulatePedigree and helpers for speed and memory usage +* Optimize simulatePedigree and helpers for speed and memory usage * Major gains in speed for deeper pedigrees * Added more tests for simulatePedigree * Fix error when not enough single people available diff --git a/R/buildComponent.R b/R/buildComponent.R index 38a96522..9c86a8fa 100644 --- a/R/buildComponent.R +++ b/R/buildComponent.R @@ -19,6 +19,9 @@ #' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" #' @param adjBeta_method numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build #' @param compress logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE. +#' @param mz_twins logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE. +#' @param mz_method character. The method to handle MZ twins. Options are "merging" (default) or "addtwins". "addtwins" adds the twin2 column to the twin1 column before tcrossprod so that all relatedness flows through a single source, then leaves the twin2 column as zero and relies on the fact that the row/col names are the same to copy the values back to twin2 after tcrossprod. "merging" merges the twin2 column into the twin1 column before tcrossprod and then copies the values back to twin2 after tcrossprod so that both twins appear in the final matrix. +#' @param beta logical. Used for benchmarking #' @param ... additional arguments to be passed to \code{\link{ped2com}} #' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. #' @export @@ -32,7 +35,7 @@ ped2com <- function(ped, component, standardize_colnames = TRUE, transpose_method = "tcrossprod", adjacency_method = "direct", - isChild_method = "classic", + isChild_method = "partialparent", saveable = FALSE, resume = FALSE, save_rate = 5, @@ -42,6 +45,9 @@ ped2com <- function(ped, component, save_path = "checkpoint/", adjBeta_method = NULL, compress = TRUE, + mz_twins = TRUE, + mz_method = "addtwins", + beta = FALSE, ...) { #------ # Check inputs @@ -121,12 +127,37 @@ ped2com <- function(ped, component, ped <- standardizeColnames(ped, verbose = config$verbose) } + mz_row_pairs <- NULL + mz_id_pairs <- NULL + + if (mz_twins == TRUE && "twinID" %in% colnames(ped)) { + df_mz <- findMZtwins(ped, + verbose = config$verbose, + returnIDs = TRUE, + returnRows = TRUE, + returnAsList = TRUE, + beta = beta + ) + mz_row_pairs <- df_mz$pair_rows + mz_id_pairs <- df_mz$pair_ids + } + + # Load final result if computation was completed if (config$resume == TRUE && file.exists(checkpoint_files$final_matrix)) { if (config$verbose == TRUE) cat("Loading final computed matrix...\n") return(readRDS(checkpoint_files$final_matrix)) } + if (mz_method %in% c("merging") && mz_twins == TRUE && !is.null(mz_row_pairs) && length(mz_row_pairs) > 0 && + config$component %in% c("additive")) { + # replace all MZ twin IDs with the first twin's ID in each pair so they are merged for the path tracing and all subsequent steps. We will copy the values back to the second twin at the end. + ped <- fuseTwins(ped = ped, mz_row_pairs = mz_row_pairs, mz_id_pairs = mz_id_pairs, config = config, beta = beta) + if (config$verbose == TRUE) { + message("Merged ", length(mz_row_pairs), " MZ twin pair(s) in pedigree dataset for path tracing") + } + } + #------ # Algorithm @@ -141,6 +172,7 @@ ped2com <- function(ped, component, cat(paste0("Family Size = ", config$nr, "\n")) } + # # Step 1: Construct parent-child adjacency matrix ## A. Resume from Checkpoint if Needed @@ -195,6 +227,8 @@ ped2com <- function(ped, component, config = config, compress = config$compress ) + + # TODO merge twin columns # --- Step 2: Compute Relatedness Matrix --- @@ -226,7 +260,7 @@ ped2com <- function(ped, component, # r is I + A + A^2 + ... = (I-A)^-1 from RAM # could trim, here ## it keeps going until it explains all of the relatedness with themselves (i.e., mtSum == 0) - # some of this precision is articifuial because we literally get to the point that the condon is eaither there or not. probabiliticy + # some of this precision is artificial because we literally get to the point that the condon is eaither there or not. probabiliticy # how much percision do we need to get unbiased estimates @@ -281,6 +315,29 @@ ped2com <- function(ped, component, compress = config$compress ) + if (mz_method == "addtwins" && mz_twins == TRUE && !is.null(mz_row_pairs) && length(mz_row_pairs) > 0) { + if (config$verbose == TRUE) { + message("MZ twin merging enabled: Will merge MZ twin columns in r2 before tcrossprod") + } + + # --- Step 3b: Add --- + # MZ twins share the same genetic source. We absorb twin2's column into + # twin1's before tcrossprod so all path-traced relatedness flows through a + # single source. After tcrossprod we copy twin1's row/col back to twin2. + if (!is.null(mz_row_pairs) && length(mz_row_pairs) > 0 && config$component %in% c("additive")) { + # Extract all indices at once for batch operations + pairs_mat <- do.call(rbind, mz_row_pairs) + idx1_all <- pairs_mat[, 1] + idx2_all <- pairs_mat[, 2] + # Batch: absorb all twin2 columns into twin1 columns, then zero twin2 + r2[, idx1_all] <- r2[, idx1_all, drop = FALSE] + r2[, idx2_all, drop = FALSE] + r2[, idx2_all] <- 0 + + if (config$verbose == TRUE) { + message("Added ", length(mz_row_pairs), " MZ twin pair column(s) in r2") + } + } + } # --- Step 4: T crossproduct --- if (config$resume == TRUE && file.exists(checkpoint_files$tcrossprod_checkpoint) && @@ -300,12 +357,68 @@ ped2com <- function(ped, component, } } + if (mz_method %in% c("merging", "addtwins") && mz_twins == TRUE && config$component %in% c("additive") && !is.null(mz_row_pairs) && length(mz_row_pairs) > 0) { + # --- Step 4b: Restore MZ twins --- + # Copy twin1's row/col to twin2 so both twins appear in the final matrix. + if (config$sparse == FALSE) { + r <- as.matrix(r) + rnames <- rownames(r) + ids_mat <- do.call(rbind, mz_id_pairs) + idx1_all <- match(ids_mat[, 1], rnames) + idx2_all <- match(ids_mat[, 2], rnames) + # Batch copy: twin1 rows/cols -> twin2 rows/cols + r[idx2_all, ] <- r[idx1_all, ] + + r[, idx2_all] <- r[, idx1_all] + } else { + # TODO this is really slow. Can we do it without coercing to dense? Maybe by doing row/col replacement on the sparse matrix directly? Or by constructing a sparse matrix with the twin2 values and adding it to r? + # r <- df_add + + rnames <- r@Dimnames[[1]] + + ids_mat <- do.call(rbind, mz_id_pairs) + # needs to use sparse indexing to avoid coercion to dense + idx1_all <- match(ids_mat[, 1], rnames) + idx2_all <- match(ids_mat[, 2], rnames) + + twin1_rows <- r[idx1_all, , drop = FALSE] + twin1_cols <- r[, idx1_all, drop = FALSE] + twin1_rows@Dimnames[[1]] <- rnames[idx2_all] + twin1_cols@Dimnames[[2]] <- rnames[idx2_all] + twin1_self <- r[idx1_all, idx1_all, drop = FALSE] + twin1_self@Dimnames[[1]] <- rnames[idx2_all] + + r[idx2_all, ] <- twin1_rows + r[, idx2_all] <- twin1_cols + r[idx2_all, idx2_all] <- twin1_self + + # Batch copy: twin1 rows/cols -> twin2 rows/cols + + # Row/column replacement on a dsCMatrix (symmetric) causes Matrix to + # coerce to dgCMatrix (general), doubling stored entries. Convert back + + r <- Matrix::drop0(r) + + # so both mz_method paths return the same sparse class. + if (methods::is(r, "CsparseMatrix") && !methods::is(r, "symmetricMatrix")) { + r <- Matrix::forceSymmetric(r) + } + } + if (config$verbose == TRUE) { + message("Restored ", length(mz_row_pairs), " MZ twin pair(s) in relatedness matrix") + } + } + + if (config$component %in% c("mitochondrial", "mtdna", "mitochondria")) { r@x <- rep(1, length(r@x)) # Assign 1 to all nonzero elements for mitochondrial component } - if (config$sparse == FALSE) { + # Remove explicit zeros so that both mz_method paths produce + # structurally identical sparse matrices + + if (config$sparse == FALSE && !methods::is(r, "matrix")) { r <- as.matrix(r) } # flattens diagonal if you don't want to deal with inbreeding @@ -335,6 +448,8 @@ ped2add <- function(ped, max_gen = 25, sparse = TRUE, verbose = FALSE, save_rate_parlist = 100000 * save_rate, save_path = "checkpoint/", compress = TRUE, + mz_twins = FALSE, + mz_method = "addtwins", ...) { ped2com( ped = ped, @@ -353,6 +468,8 @@ ped2add <- function(ped, max_gen = 25, sparse = TRUE, verbose = FALSE, save_rate_parlist = save_rate_parlist, save_path = save_path, compress = compress, + mz_twins = mz_twins, + mz_method = mz_method, ... ) } diff --git a/R/buildmxPedigrees.R b/R/buildmxPedigrees.R new file mode 100644 index 00000000..e8d14e4b --- /dev/null +++ b/R/buildmxPedigrees.R @@ -0,0 +1,414 @@ +#' Create an mxModel for a pedigree +#' +#' This function builds an OpenMx model for a pedigree with specified variance components. It requires the OpenMx package. +#' +#' @param vars A named list or vector of initial variance component values. Names should include +#' ad2 (additive), dd2 (dominance), cn2 (common nuclear), ce2 (common extended), +#' mt2 (mitochondrial), am2 (additive-mitochondrial interaction), and ee2 (unique environment). +#' Default values are provided. +#' @param Vad Logical. Include additive genetic variance component. Default is TRUE. +#' @param Vdd Logical. Include dominance genetic variance component. Default is FALSE. +#' @param Vcn Logical. Include common nuclear family environment variance component. Default is TRUE. +#' @param Vce Logical. Include common extended family environment variance component. Default is TRUE. +#' @param Vmt Logical. Include mitochondrial genetic variance component. Default is TRUE. +#' @param Vam Logical. Include additive by mitochondrial interaction variance component. Default is FALSE. +#' @param Ver Logical. Include unique environmental variance component. Default is TRUE. +#' @return An OpenMx model representing the pedigree with specified variance components. +#' @export + +buildPedigreeModelCovariance <- function( + vars = list( + ad2 = 0.5, + dd2 = 0.3, + cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, + am2 = 0.25, + ee2 = 0.6 + ), + Vad = TRUE, + Vdd = FALSE, + Vcn = TRUE, + Vce = TRUE, + Vmt = TRUE, + Vam = FALSE, + Ver = TRUE +) { + if (!requireNamespace("OpenMx", quietly = TRUE)) { + stop("OpenMx package is required for buildPedigreeModelCovariance function. Please install it.") + } else { + library(OpenMx) + } + + # Coerce to list so both c() vectors and list() inputs work with [[ ]] + vars <- as.list(vars) + + # Build the list of mxMatrix components conditionally + mat_list <- list() + if (Vad) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["ad2"]], labels = "vad", name = "Vad", lbound = 1e-10 + ))) + } + if (Vdd) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["dd2"]], labels = "vdd", name = "Vdd", lbound = 1e-10 + ))) + } + if (Vcn) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["cn2"]], labels = "vcn", name = "Vcn", lbound = 1e-10 + ))) + } + if (Vce) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["ce2"]], labels = "vce", name = "Vce", lbound = 1e-10 + ))) + } + if (Vmt) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["mt2"]], labels = "vmt", name = "Vmt", lbound = 1e-10 + ))) + } + if (Vam) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["am2"]], labels = "vam", name = "Vam", lbound = 1e-10 + ))) + } + if (Ver) { + mat_list <- c(mat_list, list(mxMatrix( + type = "Full", nrow = 1, ncol = 1, free = TRUE, + values = vars[["ee2"]], labels = "ver", name = "Ver", lbound = 1e-10 + ))) + } + + do.call(mxModel, c(list("ModelOne"), mat_list)) +} + +#' Build one family group model +#' +#' This function constructs an OpenMx model for a single family group based on +#' provided relatedness matrices and observed data. The implied covariance +#' is built as a weighted sum of the supplied relatedness matrices, where +#' the weights are variance component parameters shared across groups via +#' a parent \code{ModelOne} sub-model. +#' +#' @param group_name Name of the family group. +#' @param Addmat Additive genetic relatedness matrix (from \code{\link{ped2add}}). +#' @param Nucmat Nuclear family shared environment relatedness matrix (from \code{\link{ped2cn}}). +#' @param Extmat Extended family shared environment indicator. When non-NULL, +#' a common-extended-environment term using a unit matrix is included. +#' @param Mtdmat Mitochondrial genetic relatedness matrix (from \code{\link{ped2mit}}). +#' @param Amimat Additive by mitochondrial interaction relatedness matrix. +#' @param Dmgmat Dominance genetic relatedness matrix. +#' @param full_df_row A 1-row matrix of observed data with column names matching \code{ytemp}. +#' @param ytemp A character vector of variable names corresponding to the observed data columns. +#' @return An OpenMx model for the specified family group. +#' @export + +buildOneFamilyGroup <- function( + group_name, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL, + full_df_row, + ytemp +) { + if (!requireNamespace("OpenMx", quietly = TRUE)) { + stop("OpenMx package is required for buildOneFamilyGroup function. Please install it.") + } else { + library(OpenMx) + } + + # Determine family size from first available matrix + fsize <- NULL + for (m in list(Addmat, Nucmat, Extmat, Mtdmat, Amimat, Dmgmat)) { + if (!is.null(m)) { + fsize <- nrow(m) + break + } + } + if (is.null(fsize)) stop("At least one relatedness matrix must be provided.") + + # ------------------------------------------------------------------ + # Build the list of mxMatrix objects and the algebra terms in lockstep + # so we never reference a matrix or variance component that doesn't exist. + # ------------------------------------------------------------------ + mat_list <- list( + mxMatrix("Iden", nrow = fsize, ncol = fsize, name = "I"), + mxMatrix("Unit", nrow = fsize, ncol = fsize, name = "U") + ) + + algebra_terms <- character(0) + + if (!is.null(Addmat)) { + mat_list <- c(mat_list, list( + mxMatrix("Symm", + nrow = fsize, ncol = fsize, + values = as.matrix(Addmat), name = "A" + ) + )) + algebra_terms <- c(algebra_terms, "(A %x% ModelOne.Vad)") + } + if (!is.null(Dmgmat)) { + mat_list <- c(mat_list, list( + mxMatrix("Symm", + nrow = fsize, ncol = fsize, + values = as.matrix(Dmgmat), name = "D" + ) + )) + algebra_terms <- c(algebra_terms, "(D %x% ModelOne.Vdd)") + } + if (!is.null(Nucmat)) { + mat_list <- c(mat_list, list( + mxMatrix("Symm", + nrow = fsize, ncol = fsize, + values = as.matrix(Nucmat), name = "Cn" + ) + )) + algebra_terms <- c(algebra_terms, "(Cn %x% ModelOne.Vcn)") + } + if (!is.null(Extmat)) { + # Extmat signals "include Vce"; the algebra always uses U (unit matrix) + algebra_terms <- c(algebra_terms, "(U %x% ModelOne.Vce)") + } + if (!is.null(Amimat)) { + mat_list <- c(mat_list, list( + mxMatrix("Symm", + nrow = fsize, ncol = fsize, + values = as.matrix(Amimat), name = "Am" + ) + )) + algebra_terms <- c(algebra_terms, "(Am %x% ModelOne.Vam)") + } + if (!is.null(Mtdmat)) { + mat_list <- c(mat_list, list( + mxMatrix("Symm", + nrow = fsize, ncol = fsize, + values = as.matrix(Mtdmat), name = "Mt" + ) + )) + algebra_terms <- c(algebra_terms, "(Mt %x% ModelOne.Vmt)") + } + + # Unique environment is always included + algebra_terms <- c(algebra_terms, "(I %x% ModelOne.Ver)") + + algebra_str <- paste(algebra_terms, collapse = " + ") + + # Assemble the model via do.call so that the dynamic mat_list is unpacked + model_args <- c( + list(name = group_name), + mat_list, + list( + mxData(observed = full_df_row, type = "raw", sort = FALSE), + mxMatrix("Full", + nrow = 1, ncol = fsize, name = "M", free = TRUE, + labels = "meanLI", dimnames = list(NULL, ytemp) + ), + mxAlgebraFromString(algebra_str, + name = "V", dimnames = list(ytemp, ytemp) + ), + mxExpectationNormal(covariance = "V", means = "M"), + mxFitFunctionML() + ) + ) + + do.call(mxModel, model_args) +} + +#' Build family group models +#' +#' This function constructs OpenMx models for multiple family groups based on +#' provided relatedness matrices and observed data. +#' +#' @param dat A data frame where each row represents a family group and columns correspond to observed variables. +#' @param ytemp A vector of variable names corresponding to the observed data. +#' @param Addmat Additive genetic relatedness matrix. +#' @param Nucmat Nuclear family shared environment relatedness matrix. +#' @param Extmat Extended family shared environment relatedness matrix. +#' @param Mtdmat Mitochondrial genetic relatedness matrix. +#' @param Amimat Additive by mitochondrial interaction relatedness matrix. +#' @param Dmgmat Dominance genetic relatedness matrix. +#' @param prefix A prefix for naming the family groups. Default is "fam". +#' @return A list of OpenMx models for each family group. +#' @export + +buildFamilyGroups <- function( + dat, ytemp, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL, + prefix = "fam" +) { + numfam <- nrow(dat) + groups <- vector("list", numfam) + + for (afam in seq_len(numfam)) { + full_df_row <- matrix(dat[afam, ], nrow = 1, dimnames = list(NULL, ytemp)) + groups[[afam]] <- buildOneFamilyGroup( + group_name = paste0(prefix, afam), + Addmat = Addmat, + Nucmat = Nucmat, + Extmat = Extmat, + Mtdmat = Mtdmat, + Amimat = Amimat, + Dmgmat = Dmgmat, + full_df_row = full_df_row, + ytemp = ytemp + ) + } + + groups +} + +#' Build Pedigree mxModel +#' +#' This function constructs an OpenMx pedigree model by combining variance +#' component parameters and family group models. It auto-detects which +#' variance components are referenced in the group algebras and creates +#' only those parameters. +#' +#' @param model_name Name of the overall pedigree model. +#' @param vars A named list or vector of initial variance component values. +#' @param group_models A list of OpenMx models for each family group. +#' @return An OpenMx pedigree model combining variance components and family groups. +#' @export + +buildPedigreeMx <- function(model_name, vars, group_models) { + if (!requireNamespace("OpenMx", quietly = TRUE)) { + stop("OpenMx package is required for buildPedigreeMx function. Please install it.") + } else { + library(OpenMx) + } + + group_names <- vapply(group_models, function(m) m$name, character(1)) + + # Auto-detect which variance components the group algebras reference + # by inspecting the algebra formula strings for ModelOne.V* patterns. + # This keeps the variance component sub-model in sync with the groups. + vc_map <- c( + Vad = "ModelOne.Vad", + Vdd = "ModelOne.Vdd", + Vcn = "ModelOne.Vcn", + Vce = "ModelOne.Vce", + Vmt = "ModelOne.Vmt", + Vam = "ModelOne.Vam", + Ver = "ModelOne.Ver" + ) + + # Collect all algebra formulas from group models + all_formulas <- vapply(group_models, function(m) { + if (!is.null(m$V) && !is.null(m$V$formula)) { + deparse(m$V$formula, width.cutoff = 500L) + } else { + "" + } + }, character(1)) + all_formulas <- paste(all_formulas, collapse = " ") + + flags <- lapply(vc_map, function(pat) grepl(pat, all_formulas, fixed = TRUE)) + + mxModel( + model_name, + buildPedigreeModelCovariance( + vars, + Vad = isTRUE(flags$Vad), + Vdd = isTRUE(flags$Vdd), + Vcn = isTRUE(flags$Vcn), + Vce = isTRUE(flags$Vce), + Vmt = isTRUE(flags$Vmt), + Vam = isTRUE(flags$Vam), + Ver = isTRUE(flags$Ver) + ), + group_models, + mxFitFunctionMultigroup(group_names) + ) +} + +#' Fit an OpenMx pedigree model to observed data +#' +#' This function constructs and fits an OpenMx model for a pedigree using +#' specified variance components and family group models. +#' +#' @param model_name Character. Name for the overall OpenMx model. Default is "PedigreeModel". +#' @param vars A named list or vector of initial variance component values. +#' @param data A matrix or data frame of observed data, where each row is a family +#' and columns correspond to individuals. Only used when \code{group_models} is NULL. +#' @param group_models Optional list of pre-built OpenMx family group models +#' (from \code{\link{buildOneFamilyGroup}}). If NULL, they are generated from \code{data} +#' using the provided relatedness matrices. +#' @param Addmat Additive genetic relatedness matrix. Required when \code{group_models} is NULL. +#' @param Nucmat Common nuclear environment relatedness matrix. Optional. +#' @param Extmat Common extended environment relatedness matrix. Optional. +#' @param Mtdmat Mitochondrial relatedness matrix. Optional. +#' @param Amimat Additive-by-mitochondrial interaction matrix. Optional. +#' @param Dmgmat Dominance genetic relatedness matrix. Optional. +#' @param tryhard Logical. If TRUE (default), use \code{mxTryHard} for robust optimization; +#' if FALSE, use \code{mxRun}. +#' @return A fitted OpenMx model. +#' @export + +fitPedigreeModel <- function( + model_name = "PedigreeModel", + vars = list( + ad2 = 0.5, + dd2 = 0.3, + cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, + am2 = 0.25, + ee2 = 0.6 + ), + data = NULL, + group_models = NULL, + tryhard = TRUE, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL +) { + if (!requireNamespace("OpenMx", quietly = TRUE)) { + stop("OpenMx package is required for fitPedigreeModel function. Please install it.") + } else { + library(OpenMx) + } + + if (is.null(group_models)) { + # generate them from data and relatedness matrices + if (is.null(data)) { + stop("Either 'group_models' or 'data' must be provided.") + } + ytemp <- colnames(data) + group_models <- buildFamilyGroups( + dat = data, + ytemp = ytemp, + Addmat = Addmat, + Nucmat = Nucmat, + Extmat = Extmat, + Mtdmat = Mtdmat, + Amimat = Amimat, + Dmgmat = Dmgmat + ) + } + + pedigree_model <- buildPedigreeMx(model_name, vars, group_models) + if (tryhard) { + fitted_model <- mxTryHard(pedigree_model, silent = TRUE, extraTries = 10, intervals = TRUE) + } else { + fitted_model <- mxRun(pedigree_model) + } + fitted_model +} diff --git a/R/checkIDs.R b/R/checkIDs.R index 05f104c1..c39110f3 100644 --- a/R/checkIDs.R +++ b/R/checkIDs.R @@ -57,9 +57,11 @@ checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { if (length(validation_results$non_unique_ids) > 0) { # loop through each non-unique ID - processed <- dropIdenticalDuplicateIDs(ped = repaired_ped, + processed <- dropIdenticalDuplicateIDs( + ped = repaired_ped, ids = validation_results$non_unique_ids, - changes = changes) + changes = changes + ) repaired_ped <- processed$ped changes <- processed$changes } diff --git a/R/checkParents.R b/R/checkParents.R index 2abe3e24..d354954e 100644 --- a/R/checkParents.R +++ b/R/checkParents.R @@ -59,9 +59,6 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, validation_results$single_parents <- (length(missing_fathers) + length(missing_mothers)) > 0 - - - if (verbose && validation_results$single_parents) cat("Missing single parents found.\n") if (verbose && !validation_results$single_parents) cat("No missing single parents found.\n") @@ -269,12 +266,12 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, } # restore orginal names that the user orginally provided - ped <- restorePedColnames(ped, + ped <- restorePedColnames(ped, famID = famID, personID = personID, momID = momID, - dadID = dadID) - + dadID = dadID + ) } #' Repair Parent IDs #' diff --git a/R/constructAdjacency.R b/R/constructAdjacency.R index 3053fb5b..fe3f7bbc 100644 --- a/R/constructAdjacency.R +++ b/R/constructAdjacency.R @@ -70,7 +70,8 @@ .adjIndexed <- function(ped, component, saveable, resume, save_path, verbose, lastComputed, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, config, compress = config$compress) { + parList, lens, save_rate_parlist, + config, compress = config$compress) { # Loop through each individual in the pedigree # Build the adjacency matrix for parent-child relationships # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. @@ -207,7 +208,7 @@ return(list_of_adjacency) } -#' Construct Adjacency Matrix for Parent-Child Relationships Using Beta Method +#' Construct Adjacency Matrix for Parent-Child Relationships Using Beta Methods #' This function constructs an adjacency matrix for parent-child relationships #' using a method in beta testing. It identifies parent-child pairs based on the #' specified component of relatedness. diff --git a/R/helpChecks.R b/R/helpChecks.R index eb233bb3..fd6bf66a 100644 --- a/R/helpChecks.R +++ b/R/helpChecks.R @@ -7,8 +7,7 @@ #' @param ped A data frame representing the pedigree. #' @param ids A vector of IDs to check for duplicates in the pedigree. #' @param changes An optional list to log changes made during the process. -dropIdenticalDuplicateIDs <- function(ped, ids, changes = NULL -) { +dropIdenticalDuplicateIDs <- function(ped, ids, changes = NULL) { if (!is.data.frame(ped)) { stop("ped must be a data frame") } diff --git a/R/helpTwins.R b/R/helpTwins.R new file mode 100644 index 00000000..8a4cc58b --- /dev/null +++ b/R/helpTwins.R @@ -0,0 +1,275 @@ +#' Determine isTwin Status +#' @param ped pedigree data frame +#' @return A logical vector indicating, for each row of \code{ped}, whether +#' \code{twinID} is non-\code{NA}. +#' @keywords internal + +isTwin <- function(ped) { + is_twin <- !is.na(ped[["twinID"]]) + is_twin +} +#' Find MZ twin pair_rows in a pedigree +#' +#' Identifies MZ twin pair_rows from the \code{twinID} column and returns their +#' row indices. These indices are used later to merge the twins' columns in +#' the \code{r2} matrix before \code{tcrossprod}, which correctly produces +#' relatedness 1 between MZ co-twins with no diagonal or downstream artifacts. +#' +#' @param ped A pedigree data.frame with columns \code{ID} and \code{twinID}. +#' Optionally a \code{zygosity} column; when present only pair_rows where both +#' members have \code{zygosity == "MZ"} are used. +#' @param verbose logical. If TRUE, print progress messages. +#' @param returnIDs logical. If TRUE, return the IDs of the twin pair_rows instead of row indices. +#' @param returnRows logical. If TRUE, return the row indices of the twin pair_rows instead of IDs. +#' @param returnAsList logical. If TRUE, return results as a list of vectors +#' (default). If FALSE, return results as a data.frame with separate columns for each twin's ID and row index. +#' @param beta logical. If TRUE, use an optimized approach with O(1) lookups for large pedigrees. If FALSE (default), use a simpler approach that may be less efficient for large pedigrees. +#' @return A list of length-2 integer vectors \code{c(idx1, idx2)} giving the +#' row indices of each MZ pair in the pedigree, or \code{NULL} if none found. +#' @keywords internal +findMZtwins <- function(ped, verbose = FALSE, returnRows = TRUE, + returnIDs = FALSE, returnAsList = TRUE, + beta = FALSE) { + if (!"twinID" %in% colnames(ped)) { + return(NULL) + } + + twin_rows <- which(!is.na(ped$twinID)) + + # If zygosity column exists, restrict to MZ pair_rows + if ("zygosity" %in% colnames(ped)) { + twin_rows <- twin_rows[!is.na(ped$zygosity[twin_rows]) & + ped$zygosity[twin_rows] %in% c("mz", "MZ")] + } + + if (length(twin_rows) == 0) { + return(NULL) + } + + # Build ID-to-row lookup for O(1) resolution instead of which() per pair + id_to_row <- seq_len(nrow(ped)) + names(id_to_row) <- as.character(ped$ID) + + # Use environment as hash set for O(1) membership checks + processed <- new.env(hash = TRUE, parent = emptyenv()) + + pair_rows <- vector("list", length(twin_rows)) + if (returnIDs) { + pair_ids <- vector("list", length(twin_rows)) + } + n_pairs <- 0L + + for (idx in twin_rows) { + twin_id <- ped$ID[idx] + co_twin_id <- ped$twinID[idx] + + twin_id_chr <- as.character(twin_id) + co_twin_id_chr <- as.character(co_twin_id) + + # Skip if already processed this pair (O(1) lookup) + if (exists(twin_id_chr, envir = processed, inherits = FALSE) || + exists(co_twin_id_chr, envir = processed, inherits = FALSE)) { + next + } + + # O(1) row lookup via named vector + idx1 <- id_to_row[twin_id_chr] + idx2 <- id_to_row[co_twin_id_chr] + + if (is.na(idx1) || is.na(idx2)) next + + # Always put the lower index first for consistency + if (idx1 > idx2) { + tmp <- idx1 + idx1 <- idx2 + idx2 <- tmp + } + + # O(1) insert into hash set + assign(twin_id_chr, TRUE, envir = processed) + assign(co_twin_id_chr, TRUE, envir = processed) + + n_pairs <- n_pairs + 1L + pair_rows[[n_pairs]] <- c(idx1, idx2) + if (returnIDs) { + pair_ids[[n_pairs]] <- c(twin_id, co_twin_id) + } + if (verbose) { + message( + "MZ twin pair found: ", twin_id, " (row ", idx1, + ") and ", co_twin_id, " (row ", idx2, ")" + ) + } + } + + # Trim pre-allocated lists to actual size + if (n_pairs == 0L) { + return(NULL) + } + pair_rows <- pair_rows[seq_len(n_pairs)] + if (returnIDs) { + pair_ids <- pair_ids[seq_len(n_pairs)] + } + + if (returnIDs == TRUE && returnRows == FALSE) { + if (returnAsList == TRUE) { + return(pair_ids) + } else { + data.frame( + twin1_id = vapply(pair_ids, `[`, numeric(1), 1L), + twin2_id = vapply(pair_ids, `[`, numeric(1), 2L) + ) + } + } else if (returnRows == TRUE && returnIDs == FALSE) { + if (returnAsList == TRUE) { + return(pair_rows) + } else { + data.frame( + twin1_row = vapply(pair_rows, `[`, integer(1), 1L), + twin2_row = vapply(pair_rows, `[`, integer(1), 2L) + ) + } + } else if (returnIDs == TRUE && returnRows == TRUE) { + if (returnAsList == TRUE) { + return(list(pair_rows = pair_rows, pair_ids = pair_ids)) + } else { + return(data.frame( + twin1_id = vapply(pair_ids, `[`, numeric(1), 1L), + twin2_id = vapply(pair_ids, `[`, numeric(1), 2L), + twin1_row = vapply(pair_rows, `[`, integer(1), 1L), + twin2_row = vapply(pair_rows, `[`, integer(1), 2L) + )) + } + } else { + stop("Invalid combination of returnRows and returnIDs parameters") + } +} +# replace all MZ twin IDs with the first twin's ID in each pair so they are merged for the path tracing and all subsequent steps. We will copy the values back to the second twin at the end. + +#' Fuse MZ twin pairs in a pedigree dataset for path tracing +#' This function identifies MZ twin pairs in the pedigree dataset and merges their IDs for path tracing purposes. The second twin in each pair is made a founder (with NA parents), and all children of the second twin are redirected to the first twin. This allows for correct relatedness calculations without diagonal or downstream artifacts. +#' @param ped A pedigree data.frame with columns \code{ID}, \code{momID}, \code{dadID}, and optionally \code{twinID} and \code{zygosity}. The function will look for MZ twin pairs based on the \code{twinID} column and optionally restrict to MZ pairs if a \code{zygosity} column is present. +#' @param mz_id_pairs Optional list of length-2 character vectors specifying the IDs of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. Each element should be a character vector of length 2, e.g. \code{list(c("ID1", "ID2"), c("ID3", "ID4"))}. +#' @param mz_row_pairs Optional list of length-2 integer vectors specifying the row indices of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. Each element should be an integer vector of length 2, e.g. \code{list(c(1, 2), c(3, 4))}. +#' @param test_df_twins logical. If TRUE, return the data frame of twin pairs instead of the modified pedigree. Default is FALSE. +#' @param df_twins Optional data frame with columns \code{twin1_id}, \code{twin2_id}, \code{twin1_row}, and \code{twin2_row} specifying the IDs and row indices of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. If this parameter is provided, it takes precedence over \code{mz_id_pairs} and \code{mz_row_pairs}. If \code{test_df_twins} is TRUE, this data frame will be returned for testing purposes instead of performing the fusion. +#' @param beta logical. If TRUE, use an optimized approach with O(1) lookups for large pedigrees when identifying MZ twins. Default is FALSE. +#' @param config A list of configuration options. +#' @return A modified version of the input pedigree data.frame with MZ twin pairs fused for path tracing. If \code{test_df_twins} is TRUE, returns the data frame of identified twin pairs instead. +# 70% of the time is here +fuseTwins <- function(ped, + df_twins = NULL, + mz_id_pairs = NULL, + mz_row_pairs = NULL, + config = list(verbose = FALSE), + test_df_twins = FALSE, + beta = FALSE) { + # make df_twins if not already made, and test it if requested, before proceeding with the fusion. This allows users to provide their own mz_id_pairs or mz_row_pairs and have them converted to df_twins for testing before the fusion is attempted. + + if (!is.null(df_twins)) { + if (!all(c("twin1_id", "twin2_id", "twin1_row", "twin2_row") %in% colnames(df_twins))) { + stop("df_twins must have columns twin1_id, twin2_id, twin1_row, and twin2_row") + } + if (test_df_twins == TRUE) { + return(df_twins) + } + } else if (!is.null(mz_id_pairs) && !is.null(mz_row_pairs) && length(mz_id_pairs) == length(mz_row_pairs)) { + df_twins <- lapply(1:length(mz_id_pairs), function(i) { + twin1_id <- mz_id_pairs[[i]][1] + twin2_id <- mz_id_pairs[[i]][2] + twin1_row <- mz_row_pairs[[i]][1] + twin2_row <- mz_row_pairs[[i]][2] + data.frame( + twin1_id = twin1_id, twin2_id = twin2_id, + twin1_row = twin1_row, twin2_row = twin2_row + ) + }) + df_twins <- do.call(rbind, df_twins) + rownames(df_twins) <- NULL + if (test_df_twins == TRUE) { + return(df_twins) + } + } else if (is.null(mz_id_pairs) && is.null(mz_row_pairs)) { + df_twins <- findMZtwins(ped, + verbose = config$verbose, + returnRows = TRUE, returnIDs = TRUE, returnAsList = FALSE + ) + if (test_df_twins == TRUE) { + return(df_twins) + } + } else if (!is.null(mz_id_pairs) && is.null(mz_row_pairs)) { + df_twins <- lapply(mz_id_pairs, function(pair) { + twin1_row <- which(ped$ID == pair[1]) + twin2_row <- which(ped$ID == pair[2]) + data.frame( + twin1_id = pair[1], twin2_id = pair[2], + twin1_row = twin1_row, twin2_row = twin2_row + ) + }) + df_twins <- do.call(rbind, df_twins) + rownames(df_twins) <- NULL + if (test_df_twins == TRUE) { + return(df_twins) + } + } else if (is.null(mz_id_pairs) && !is.null(mz_row_pairs)) { + df_twins <- lapply(mz_row_pairs, function(row) { + twin1_id <- ped$ID[row[1]] + twin2_id <- ped$ID[row[2]] + data.frame( + twin1_id = twin1_id, + twin2_id = twin2_id, + twin1_row = row[1], + twin2_row = row[2] + ) + }) + df_twins <- do.call(rbind, df_twins) + # remove row names + rownames(df_twins) <- NULL + if (test_df_twins == TRUE) { + return(df_twins) + } + } else { + stop("Invalid input: must provide either mz_id_pairs, mz_row_pairs, or df_twins") + } + + + fuseattemptable <- !is.null(df_twins) || (!is.null(mz_id_pairs) && length(mz_id_pairs) > 0) || (!is.null(mz_row_pairs) && length(mz_row_pairs) > 0) + if (fuseattemptable == TRUE) { + if (config$verbose == TRUE) { + message("MZ twin pairs identified for fusion") + } + + twin1s_id <- df_twins$twin1_id + twin2s_id <- df_twins$twin2_id + twin2s_row <- df_twins$twin2_row + + # Make twin2s founders + # ped$momID[twin2s_row] <- NA + # ped$dadID[twin2s_row] <- NA + + twin2s_as_mom <- which(ped$momID %in% twin2s_id) + twin2s_as_dad <- which(ped$dadID %in% twin2s_id) + # Now redirect all children of twin2 to twin1 + ped$momID[twin2s_as_mom] <- twin1s_id[match(ped$momID[twin2s_as_mom], twin2s_id)] + ped$dadID[twin2s_as_dad] <- twin1s_id[match(ped$dadID[twin2s_as_dad], twin2s_id)] + + if ("spouseID" %in% colnames(ped)) { + twin2s_as_spouse <- which(ped$spouseID %in% twin2s_id) + ped$spouseID[twin2s_as_spouse] <- twin1s_id[match(ped$spouseID[twin2s_as_spouse], twin2s_id)] + } + if ("spID" %in% colnames(ped)) { + twin2s_as_spID <- which(ped$spID %in% twin2s_id) + ped$spID[twin2s_as_spID] <- twin1s_id[match(ped$spID[twin2s_as_spID], twin2s_id)] + } + + if (config$verbose == TRUE) { + message("Merged ", length(twin1s_id), " MZ twin pair(s) in pedigree dataset for path tracing") + } + } else { + if (config$verbose == TRUE) { + message("No MZ twin pair_rows found in pedigree dataset") + } + } + + return(ped) +} diff --git a/R/segmentPedigree.R b/R/segmentPedigree.R index 3179dbfd..d74abfb3 100644 --- a/R/segmentPedigree.R +++ b/R/segmentPedigree.R @@ -10,6 +10,7 @@ #' @param momID character. Name of the column in ped for the mother ID variable #' @param dadID character. Name of the column in ped for the father ID variable #' @param famID character. Name of the column to be created in ped for the family ID variable +#' @param twinID character. Name of the column in ped for the twin ID variable, if applicable #' @param ... additional arguments to be passed to \code{\link{ped2com}} #' @details #' The general idea of this function is to use person ID, mother ID, and father ID to @@ -29,17 +30,25 @@ #' ped2fam <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", famID = "famID", + twinID = "twinID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, dadID = dadID, famID = famID, type = "parents") + .ped2id( + ped = ped, personID = personID, momID = momID, dadID = dadID, famID = famID, twinID = twinID, + type = "parents" + ) } .ped2id <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", - famID = "famID", type, + famID = "famID", twinID = "twinID", + type, ...) { # Turn pedigree into family - pg <- ped2graph(ped = ped, personID = personID, momID = momID, dadID = dadID, adjacent = type) + pg <- ped2graph( + ped = ped, personID = personID, momID = momID, dadID = dadID, twinID = twinID, + adjacent = type + ) # Find weakly connected components of graph wcc <- igraph::components(pg) @@ -97,6 +106,7 @@ ped2graph <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", + twinID = "twinID", directed = TRUE, adjacent = c("parents", "mothers", "fathers"), ...) { @@ -196,11 +206,14 @@ ped2graph <- function(ped, #' ped2maternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", - matID = "matID", ...) { + matID = "matID", + twinID = "twinID", + ...) { # Call to wrapper function .ped2id( ped = ped, personID = personID, momID = momID, - dadID = dadID, famID = matID, type = "mothers" + dadID = dadID, famID = matID, twinID = twinID, + type = "mothers" ) } @@ -219,10 +232,14 @@ ped2maternal <- function(ped, personID = "ID", #' ped2paternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", - patID = "patID", ...) { + patID = "patID", + twinID = "twinID", + ...) { # Call to wrapper function .ped2id( ped = ped, personID = personID, momID = momID, - dadID = dadID, famID = patID, type = "fathers" + dadID = dadID, famID = patID, + twinID = twinID, + type = "fathers" ) } diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 0517eaa2..9539719c 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -173,8 +173,9 @@ buildBetweenGenerations_base <- function(df_Fam, # count the number of couples in the i th gen - countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 - + if (verbose == TRUE) { + countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 + } # Assign couple IDs within generation i. df_Ngen <- assignCoupleIds(df_Ngen, beta = beta) @@ -457,8 +458,388 @@ buildBetweenGenerations_base <- function(df_Fam, return(df_Fam) } -buildBetweenGenerations_optimized <- buildBetweenGenerations_base # Placeholder for optimized version +buildBetweenGenerations_optimized <- function(df_Fam, + Ngen, + sizeGens, + verbose = FALSE, + marR, sexR, kpc, + rd_kpc, personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + beta = TRUE) { + # Initialize flags for the full pedigree data frame. + # These are used throughout linkage and get overwritten per-generation as needed. + + df_Fam$ifparent <- FALSE + df_Fam$ifson <- FALSE + df_Fam$ifdau <- FALSE + + # Precompute row indices per generation once. + # This avoids repeated df_Fam$gen == i scans inside loops. + gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen) + + # Loop across generations 1..Ngen. + + for (i in seq_len(Ngen)) { + # ------------------------------------------------------------------------- + # Generation 1: base case + # Generation 1 individuals are founders and are treated as "parents" by design. + # They do not have assigned mother/father, so we just set flags and continue. + # ------------------------------------------------------------------------- + + if (i == 1) { + rows_i <- gen_rows[[as.character(i)]] + df_Ngen <- df_Fam[rows_i, , drop = FALSE] + + # Mark everyone in generation 1 as parents (founder couple logic occurs earlier). + df_Ngen$ifparent <- TRUE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + df_Fam[rows_i, ] <- df_Ngen + # Write back into the main df_Fam. + } else { + # calculate the number of couples in the i-1 th generation + rows_i <- gen_rows[[as.character(i)]] + rows_prev <- gen_rows[[as.character(i - 1)]] + + # ------------------------------------------------------------------------- + # Step A: Determine how many couples exist in generation i-1 + # + # In your representation, each coupled individual has a non-NA spID, and each couple + # appears twice (one row per spouse). Therefore: + # number_of_couples = (number_of_non_single_individuals) / 2 + # where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID) + # ------------------------------------------------------------------------- + + N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5 + + # Expected number of offspring linked to those couples (before sex split). + + N_LinkedMem <- N_couples * kpc + # Split linked offspring into female and male counts using sexR, + # where sexR is the proportion male, so (1 - sexR) is the proportion female. + + N_LinkedFemale <- round(N_LinkedMem * (1 - sexR)) + N_LinkedMale <- N_LinkedMem - N_LinkedFemale + + + # ------------------------------------------------------------------------- + # Step B: Prepare generation i data, assign couple IDs, and mark potential children + # ------------------------------------------------------------------------- + + # get the df for the i the generation + df_Ngen <- df_Fam[rows_i, , drop = FALSE] + + + # Reset per-generation fields that will be recomputed. + df_Ngen$ifparent <- FALSE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + df_Ngen$coupleId <- NA_character_ + + # Randomly permute generation i rows so selection is not tied to row order. + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE] + + # Start to connect children with mother and father + + if (verbose == TRUE) { + message( + "Step 2.1: mark a group of potential sons and daughters in the i th generation" + ) + } + + + # count the number of couples in the i th gen + countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 + + # Assign couple IDs within generation i. + df_Ngen <- assignCoupleIds(df_Ngen, beta = beta) + + # Identify singles in generation i (no spouse). + + IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)] + + # Count singles by sex; these affect how many "linked" children can come from couples. + SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID)) + SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID)) + + # Number of linked females that must come from couples after excluding single females. + # This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau. + + CoupleF <- N_LinkedFemale - SingleF + + # Mark potential sons and daughters within generation i. + # This writes ifson/ifdau into the returned data frame + df_Fam[rows_i, ] <- markPotentialChildren( + df_Ngen = df_Ngen, + i = i, + Ngen = Ngen, + sizeGens = sizeGens, + CoupleF = CoupleF, + code_male = code_male, + code_female = code_female, + beta = beta + ) + + # ------------------------------------------------------------------------- + # Step C: Mark a subset of generation i-1 couples as parents (ifparent) + # + # OPTIMIZATION: Instead of looping through individuals and doing linear + # spouse lookups (O(n²)), we pre-identify all couples using vectorized + # operations, sample the needed number of couples directly, and mark both + # spouses in one vectorized operation (O(n)). + # + # Goal: choose enough married couples (based on marR) to be parents. + # ------------------------------------------------------------------------- + + if (verbose == TRUE) { + message( + "Step 2.2: mark a group of potential parents in the i-1 th generation" + ) + } + df_Ngen <- df_Fam[rows_prev, , drop = FALSE] + + # Reset flags within i-1 before reselecting parent couples. + df_Ngen$ifparent <- FALSE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + + # Randomize order so parent selection is not tied to row ordering. + # This matches the base version and ensures similar random behavior. + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE] + + # OPTIMIZED: Fully vectorized parent couple selection + # Process all couples at once instead of looping through individuals + + # Identify individuals with spouses + has_spouse <- !is.na(df_Ngen$spID) + + if (any(has_spouse)) { + # Create symmetric couple keys for ALL rows (NA for singles) + couple_keys_all <- ifelse( + has_spouse, + paste( + pmin(df_Ngen$id, df_Ngen$spID), + pmax(df_Ngen$id, df_Ngen$spID), + sep = "_" + ), + NA_character_ + ) + + # Find first occurrence of each couple using !duplicated() + # This gives us unique couples in the order they appear (after randomization) + first_occurrence <- !duplicated(couple_keys_all) & has_spouse + + # Get the unique couple keys in order + unique_couples_ordered <- couple_keys_all[first_occurrence] + + # Calculate how many couples to select + # Target: marR proportion of individuals = (marR * n) / 2 couples + n_couples_target <- ceiling(sizeGens[i - 1] * marR / 2) + n_couples_target <- min(n_couples_target, length(unique_couples_ordered)) + + # Select first n couples (in randomized order from the shuffling above) + selected_couples <- unique_couples_ordered[seq_len(n_couples_target)] + + # Mark all individuals in selected couples as parents (vectorized) + df_Ngen$ifparent <- couple_keys_all %in% selected_couples + } else { + df_Ngen$ifparent <- FALSE + } + + df_Fam[rows_prev, ] <- df_Ngen + + if (verbose == TRUE) { + message( + "Step 2.3: connect the i and i-1 th generation" + ) + } + + + if (i == 1) { + next + } else { + # Pull the two generations together. + # OPTIMIZATION: Use pre-computed row indices instead of df_Fam$gen %in% c(i, i-1) + df_Ngen <- df_Fam[c(rows_prev, rows_i), , drop = FALSE] + + sizeI <- sizeGens[i - 1] + sizeII <- sizeGens[i] + + # Collect IDs of marked sons and daughters in generation i. + IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i] + IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i] + # Interleave sons and daughters to get an offspring list. + IdOfp <- evenInsert(IdSon, IdDau) + + # nMates is number of parent couples selected (ifparent rows are individuals). + nMates <- sum(df_Ngen$ifparent) / 2 + + # If no mates or no offspring were selected for linkage, skip linkage. + if (nMates <= 0 || length(IdOfp) == 0) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next + } + + # generate link kids to the couples + random_numbers <- adjustKidsPerCouple( + nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc, + rd_kpc = rd_kpc, + beta = beta + ) + + # Guard: adjustKidsPerCouple returned nothing usable + if (length(random_numbers) == 0 || all(is.na(random_numbers))) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next + } + + # ------------------------------------------------------------------------- + # Step E: Build parent assignment vectors IdMa and IdPa + # + # The goal is to expand couples into per-child vectors of mother IDs and father IDs, + # where each couple contributes random_numbers[couple_index] children. + # + # Important: df_Ngen contains both generations. We only want parent generation rows. + # ------------------------------------------------------------------------- + + # Identify rows in df_Ngen that belong to generation i-1 (parent generation). + rows_prev_in_pair <- which(df_Ngen$gen == (i - 1)) + + # Extract parent generation into a smaller frame to make operations faster and clearer. + prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE] + + # Keep only those rows that are marked ifparent and are actually paired (non-NA spID). + parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID)) + + # If no usable parent couples remain, skip linkage. + if (length(parent_rows) == 0) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next + } + # Create a symmetric couple key so we can keep only one row per couple. + a <- pmin(prev$id, prev$spID) + b <- pmax(prev$id, prev$spID) + couple_key <- paste(a, b, sep = "_") + + # Keep only the first row for each couple among the parent rows. + parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])] + + # Determine whether each kept row corresponds to the female member of the couple. + # If the kept row is female: mother = id, father = spID + # If the kept row is male: father = id, mother = spID + is_female_row <- prev$sex[parent_rows] == code_female + # One mother ID per couple. + ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows]) + + # One father ID per couple. + pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows]) + + # Align lengths between couples and random_numbers. + # If random_numbers is longer than couples, truncate random_numbers. + # If random_numbers is shorter than couples, drop extra couples. + nCouples <- length(parent_rows) + + if (length(random_numbers) > nCouples) { + random_numbers <- random_numbers[seq_len(nCouples)] + } else if (length(random_numbers) < nCouples) { + keep <- seq_len(length(random_numbers)) + ma_ids <- ma_ids[keep] + pa_ids <- pa_ids[keep] + } + + # Expand from "one mother/father per couple" to "one mother/father per child". + # rep.int is used to avoid extra overhead. + IdMa <- rep.int(ma_ids, times = random_numbers) + IdPa <- rep.int(pa_ids, times = random_numbers) + + # ------------------------------------------------------------------------- + # Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp + # + # Two mismatch cases: + # 1) Too many parent slots relative to offspring: drop excess parent slots. + # 2) Too many offspring relative to parent slots: drop some offspring. + # + # drop singles first (IdSingle) when reducing offspring. + # ------------------------------------------------------------------------- + + + if (length(IdPa) - length(IdOfp) > 0) { + if (verbose == TRUE) { + message("length of IdPa", length(IdPa), "\n") + } + # Excess parent slots: randomly remove that many entries from IdPa and IdMa. + + excess <- length(IdPa) - length(IdOfp) + if (length(IdPa) > 0 && excess > 0) { + IdRm <- sample.int(length(IdPa), size = excess) + IdPa <- IdPa[-IdRm] + IdMa <- IdMa[-IdRm] + } + } else if (length(IdPa) - length(IdOfp) < 0) { + if (verbose == TRUE) { + message("length of IdOfp", length(IdOfp), "\n") + message("length of IdPa", length(IdPa), "\n") + message("length of IdSingle", length(IdMa), "\n") + } + + + # harden the resample call when IdSingle is empty: + # Need to drop some offspring because we do not have enough parent slots. + need_drop <- length(IdOfp) - length(IdPa) + + if (need_drop > 0) { + if (length(IdSingle) > 0) { + # Preferentially remove offspring IDs that correspond to singles. + # resample is expected to return a vector of IDs to remove. + + IdRm <- resample(IdSingle, size = need_drop) + IdOfp <- IdOfp[!(IdOfp %in% IdRm)] + } else { + # If there are no singles to target, drop arbitrary offspring indices. + drop_idx <- sample.int(length(IdOfp), size = need_drop) + IdOfp <- IdOfp[-drop_idx] + } + } + } + + # ------------------------------------------------------------------------- + # Step G: Assign pat/mat into df_Ngen for the selected offspring. + # + # Replaces the old loop: + # for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ... + # Using match avoids repeated scanning over df_Ngen$id. + # ------------------------------------------------------------------------- + + # Find row positions in df_Ngen corresponding to offspring IDs. + child_rows <- match(IdOfp, df_Ngen$id) + # Only keep rows that matched successfully. + + ok <- !is.na(child_rows) + + if (any(ok)) { + # Assign father IDs and mother IDs to offspring rows. + + df_Ngen$pat[child_rows[ok]] <- IdPa[ok] + df_Ngen$mat[child_rows[ok]] <- IdMa[ok] + } + # ------------------------------------------------------------------------- + # Step H: Write the two generations back into df_Fam using the precomputed indices. + # ------------------------------------------------------------------------- + + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + } + } + } + return(df_Fam) +} #' Simulate Pedigrees #' This function simulates "balanced" pedigrees based on a group of parameters: @@ -493,7 +874,18 @@ buildBetweenGenerations_optimized <- buildBetweenGenerations_base # Placeholder #' @param code_female The value to use for females. Default is "F" #' @param fam_shift An integer to shift the person ID. Default is 1L. #' This is useful when simulating multiple pedigrees to avoid ID conflicts. -#' @param beta logical. If TRUE, use the optimized version of the algorithm. +#' @param beta logical or character. Controls which algorithm version to use: +#' \itemize{ +#' \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. +#' Slower but ensures exact reproducibility with set.seed().} +#' \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. +#' Produces statistically equivalent results but not identical to base version +#' due to different random number consumption. Recommended for large simulations +#' where speed matters more than exact reproducibility.} +#' } +#' Note: Both versions are mathematically correct and produce valid pedigrees with the +#' same statistical properties (sex ratios, mating rates, etc.). The optimized version +#' uses vectorized operations instead of loops, making it much faster for large pedigrees. #' @param ... Additional arguments to be passed to other functions. #' @inheritParams ped2fam #' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID". diff --git a/R/tweakPedigree.R b/R/tweakPedigree.R index 88f96128..d4163fcd 100644 --- a/R/tweakPedigree.R +++ b/R/tweakPedigree.R @@ -9,6 +9,7 @@ #' @param verbose logical. If TRUE, print progress through stages of algorithm #' @param gen_twin A vector of \code{generation} of the twin to be imputed. #' @param zygosity A character string indicating the zygosity of the twins. Default is "MZ" for monozygotic twins. +#' @param twin_sex A character string indicating the sex of the twins. Default is randomly assigned ("R"). If specified, it should be either "M" or "F" #' @return Returns a \code{data.frame} with MZ twins information added as a new column. #' @export @@ -18,7 +19,8 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, ID_twin2 = NA_integer_, gen_twin = 2, verbose = FALSE, - zygosity = "MZ") { + zygosity = "MZ", + twin_sex = "R") { # Check if the ped is the same format as the output of simulatePedigree if (paste0(colnames(ped), collapse = "") != paste0(c( "famID", "ID", "gen", @@ -30,37 +32,57 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, } # stop("The input pedigree is not in the same format as the output of simulatePedigree") } + ped$MZtwin <- NA_integer_ - ped$zygosity <- NA_character_ + ped$MZzygosity <- NA_character_ # Check if the two IDs are provided - if (is.na(ID_twin1) || is.na(ID_twin2)) { + + # Build ID-to-row index for O(1) attribute lookups + id_row_map <- seq_len(nrow(ped)) + names(id_row_map) <- as.character(ped$ID) + + if (is.na(ID_twin1) && is.na(ID_twin2)) { # Check if the generation is provided if (is.na(gen_twin)) { stop("You should provide either the IDs of the twins or the generation of the twins") } else { # Check if the generation is valid if (gen_twin < 2 || gen_twin > max(ped$gen)) { - stop("The generation of the twins should be an integer between 2 and the maximum generation in the pedigree") + warning("The generation of the twins should be an integer between 2 and the maximum generation in the pedigree") + # remove the MZtwin and zygosity columns + ped$MZzygosity <- NULL + ped$MZtwin <- NULL + return(ped) } else { idx <- nrow(ped[ped$gen == gen_twin & !is.na(ped$dadID), ]) usedID <- c() # randomly loop through all the individuals in the generation until find an individual who is the same sex and shares the same dadID and momID with another individual + # Pre-compute generation mask once to avoid repeated subsetting + gen_mask <- ped$gen == gen_twin & !is.na(ped$dadID) + gen_ids <- ped$ID[gen_mask] + # Build ID-to-row index for O(1) attribute lookups + id_row_map <- seq_len(nrow(ped)) + names(id_row_map) <- as.character(ped$ID) for (i in 1:idx) { - # cat("loop", i, "\n") # check if i is equal to the number of individuals in the generation usedID <- c(usedID, ID_twin1) # message(usedID) if (i < idx) { # randomly select one individual from the generation - ID_twin1 <- resample(ped$ID[ped$gen == gen_twin & !(ped$ID %in% usedID) & !is.na(ped$dadID)], 1) - # cat("twin1", ID_twin1, "\n") + ID_twin1 <- resample(ped$ID[gen_mask & !(ped$ID %in% usedID)], 1) + # Cache twin1 attributes via O(1) row lookup + twin1_row <- id_row_map[as.character(ID_twin1)] + twin1_sex <- ped$sex[twin1_row] + twin1_dad <- ped$dadID[twin1_row] + twin1_mom <- ped$momID[twin1_row] # find one same sex sibling who has the same dadID and momID as the selected individual - if (zygosity %in% c("MZ", "SS")) { - twin2_Pool <- ped$ID[ped$ID != ID_twin1 & ped$gen == gen_twin & ped$sex == ped$sex[ped$ID == ID_twin1] & ped$dadID == ped$dadID[ped$ID == ID_twin1] & ped$momID == ped$momID[ped$ID == ID_twin1]] - } else if (zygosity == "DZ") { - twin2_Pool <- ped$ID[ped$ID != ID_twin1 & ped$gen == gen_twin & ped$dadID == ped$dadID[ped$ID == ID_twin1] & ped$momID == ped$momID[ped$ID == ID_twin1]] - } else if (zygosity == "OS") { - twin2_Pool <- ped$ID[ped$ID != ID_twin1 & ped$gen == gen_twin & ped$sex != ped$sex[ped$ID == ID_twin1] & ped$dadID == ped$dadID[ped$ID == ID_twin1] & ped$momID == ped$momID[ped$ID == ID_twin1]] + sib_mask <- ped$ID != ID_twin1 & gen_mask & ped$dadID == twin1_dad & ped$momID == twin1_mom + if (zygosity %in% c("mz", "MZ", "SS", "ss")) { + twin2_Pool <- ped$ID[sib_mask & ped$sex == twin1_sex] + } else if (zygosity %in% c("DZ", "dz")) { + twin2_Pool <- ped$ID[sib_mask] + } else if (zygosity %in% c("OS", "os")) { + twin2_Pool <- ped$ID[sib_mask & ped$sex != twin1_sex] } else { stop("The zygosity should be either 'MZ', 'DZ', or 'OS'") } @@ -81,7 +103,14 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, } else { # randomly select all males or females in the generation and put them in a vector if (zygosity %in% c("MZ", "SS")) { - selectGender <- ped$ID[ped$gen == gen_twin & ped$sex == resample(c("M", "F"), 1) & !is.na(ped$dadID) & !is.na(ped$momID)] + if (twin_sex == "R") { + twin_sex_select <- resample(c("M", "F"), 1) + } else { + twin_sex_select <- twin_sex + } + + selectGender <- ped$ID[ped$gen == gen_twin & ped$sex == twin_sex_select & !is.na(ped$dadID) & !is.na(ped$momID)] + notselectGender <- ped$ID[ped$gen == gen_twin & ped$sex != twin_sex_select & !is.na(ped$dadID) & !is.na(ped$momID)] } else if (zygosity %in% c("DZ")) { selectGender <- ped$ID[ped$gen == gen_twin & !is.na(ped$dadID) & !is.na(ped$momID)] } else if (zygosity %in% c("OS")) { @@ -91,9 +120,13 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, } # message(selectGender) - if (length(selectGender) < 2) { + if (length(selectGender) < 2 && length(notselectGender) < 2) { stop("There are no available same-sex people in the generation to make twins") + } else if (twin_sex == "R" && length(selectGender) < 2 && length(notselectGender) >= 2) { + selectGender <- notselectGender } + + # randomly select two individuals from the vector ID_DoubleTwin <- resample(selectGender, 2) # message(ID_DoubleTwin) @@ -112,18 +145,81 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, # Set the zygosity of the twins } } - } else { + } else if (!is.na(ID_twin1) && !is.na(ID_twin2)) { # Impute the IDs of the twin in the MZtwin column ped$MZtwin[ped$ID == ID_twin1] <- ID_twin2 ped$MZtwin[ped$ID == ID_twin2] <- ID_twin1 + } else if (!is.na(ID_twin1) && is.na(ID_twin2)) { + twin1_row <- id_row_map[as.character(ID_twin1)] + twin1_sex <- ped$sex[twin1_row] + twin1_dad <- ped$dadID[twin1_row] + twin1_mom <- ped$momID[twin1_row] + sib_mask <- ped$ID != ID_twin1 & !is.na(ped$dadID) & ped$dadID == twin1_dad & !is.na(ped$momID) & ped$momID == twin1_mom + if (zygosity %in% c("mz", "MZ", "SS", "ss")) { + twin2_Pool <- ped$ID[sib_mask & ped$sex == twin1_sex] + } else if (zygosity %in% c("DZ", "dz")) { + twin2_Pool <- ped$ID[sib_mask] + } else if (zygosity %in% c("OS", "os")) { + twin2_Pool <- ped$ID[sib_mask & ped$sex != twin1_sex] + } else { + stop("The zygosity should be either 'MZ', 'DZ', or 'OS'") + } + twin2_Pool <- twin2_Pool[!is.na(twin2_Pool)] + if (length(twin2_Pool) == 0) { + stop("No suitable sibling found for ID_twin1 = ", ID_twin1, " with zygosity '", zygosity, "'") + } + ID_twin2 <- resample(twin2_Pool, 1) + if (verbose) cat("Auto-selected ID_twin2 =", ID_twin2, "\n") + } else if (is.na(ID_twin1) && !is.na(ID_twin2)) { + # Mirror: find a match for twin2 + twin2_row <- id_row_map[as.character(ID_twin2)] + twin2_sex <- ped$sex[twin2_row] + twin2_dad <- ped$dadID[twin2_row] + twin2_mom <- ped$momID[twin2_row] + sib_mask <- ped$ID != ID_twin2 & !is.na(ped$dadID) & ped$dadID == twin2_dad & !is.na(ped$momID) & ped$momID == twin2_mom + if (zygosity %in% c("mz", "MZ", "SS", "ss")) { + twin1_Pool <- ped$ID[sib_mask & ped$sex == twin2_sex] + } else if (zygosity %in% c("DZ", "dz")) { + twin1_Pool <- ped$ID[sib_mask] + } else if (zygosity %in% c("OS", "os")) { + twin1_Pool <- ped$ID[sib_mask & ped$sex != twin2_sex] + } else { + stop("The zygosity should be either 'MZ', 'DZ', or 'OS'") + } + + + twin1_Pool <- twin1_Pool[!is.na(twin1_Pool)] + if (length(twin1_Pool) == 0) { + stop("No suitable sibling found for ID_twin2 = ", ID_twin2, " with zygosity '", zygosity, "'") + } + ID_twin1 <- resample(twin1_Pool, 1) + if (verbose) cat("Auto-selected ID_twin1 =", ID_twin1, "\n") } + + if (verbose == TRUE) { cat("twin1", ID_twin1, "\n") cat("twin2", ID_twin2, "\n") } - names(ped)[names(ped) == "MZtwin"] <- "twinID" - ped$zygosity[ped$ID == ID_twin1] <- zygosity - ped$zygosity[ped$ID == ID_twin2] <- zygosity + ped$MZtwin[ped$ID == ID_twin1] <- ID_twin2 + ped$MZtwin[ped$ID == ID_twin2] <- ID_twin1 + + if ("twinID" %in% colnames(ped)) { + ped$twinID[!is.na(ped$MZtwin)] <- ped$MZtwin[!is.na(ped$MZtwin)] + ped$MZtwin <- NULL + } else { + names(ped)[names(ped) == "MZtwin"] <- "twinID" + } + if ("zygosity" %in% colnames(ped)) { + ped$zygosity[ped$ID == ID_twin1] <- zygosity + ped$zygosity[ped$ID == ID_twin2] <- zygosity + ped$MZzygosity <- NULL + } else { + names(ped)[names(ped) == "MZzygosity"] <- "zygosity" + ped$zygosity[ped$ID == ID_twin1] <- zygosity + ped$zygosity[ped$ID == ID_twin2] <- zygosity + } + return(ped) } @@ -139,6 +235,7 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, #' @param verbose logical. If TRUE, print progress through stages of algorithm #' @param gen_inbred A vector of \code{generation} of the twin to be imputed. #' @param type_inbred A character vector indicating the type of inbreeding. "sib" for sibling inbreeding and "cousin" for cousin inbreeding. +#' @param prefer_unmated A logical indicating whether to prefer unmated siblings when automatically selecting inbred mates. Default is FALSE, which means the function will consider all siblings regardless of their mating status. #' @return Returns a \code{data.frame} with some inbred mates. #' @details #' This function creates inbred mates in the simulated pedigree \code{data.frame}. This function's purpose is to evaluate the effect of inbreeding on model fitting and parameter estimation. In case it needs to be said, we do not condone inbreeding in real life. But we recognize that it is a common practice in some fields to create inbred strains for research purposes. @@ -150,7 +247,8 @@ makeInbreeding <- function(ped, ID_mate2 = NA_integer_, verbose = FALSE, gen_inbred = 2, - type_inbred = "sib") { + type_inbred = "sib", + prefer_unmated = FALSE) { # check if the ped is the same format as the output of simulatePedigree if (paste0(colnames(ped), @@ -175,8 +273,67 @@ makeInbreeding <- function(ped, stop("The type of inbreeding should be either 'sib' or 'cousin'") return() } - # check if the two IDs are provided - if (is.na(ID_mate1) || is.na(ID_mate2)) { + + if (!is.na(ID_mate1) && is.na(ID_mate2)) { + mate1_sex <- ped$sex[ped$ID == ID_mate1] + mate1_dad <- ped$dadID[ped$ID == ID_mate1] + mate1_mom <- ped$momID[ped$ID == ID_mate1] + if (!is.na(mate1_dad) && !is.na(mate1_mom)) { + # prefer unmated opposite-sex sibling + if (prefer_unmated == TRUE) { + pool <- makePool( + ped = ped, mate_id = ID_mate1, mate_sex = mate1_sex, + mate_dad = mate1_dad, mate_mom = mate1_mom, prefer_unmated = TRUE + ) + + if (length(pool) == 0) { + # fall back to mated opposite-sex sibling + pool <- makePool( + ped = ped, mate_id = ID_mate1, mate_sex = mate1_sex, + mate_dad = mate1_dad, mate_mom = mate1_mom, prefer_unmated = FALSE + ) + } + } else { + pool <- makePool( + ped = ped, mate_id = ID_mate1, mate_sex = mate1_sex, + mate_dad = mate1_dad, mate_mom = mate1_mom, prefer_unmated = FALSE + ) + } + + if (length(pool) > 0) { + ID_mate2 <- resample(pool, 1) + if (verbose) message("Auto-selected ID_mate2 = ", ID_mate2) + } + } + } else if (is.na(ID_mate1) && !is.na(ID_mate2)) { + mate2_sex <- ped$sex[ped$ID == ID_mate2] + mate2_dad <- ped$dadID[ped$ID == ID_mate2] + mate2_mom <- ped$momID[ped$ID == ID_mate2] + if (!is.na(mate2_dad) && !is.na(mate2_mom)) { + if (prefer_unmated) { + pool <- makePool( + ped = ped, mate_id = ID_mate2, mate_sex = mate2_sex, + mate_dad = mate2_dad, mate_mom = mate2_mom, prefer_unmated = TRUE + ) + # fall back + if (length(pool) == 0) { + pool <- makePool( + ped = ped, mate_id = ID_mate2, mate_sex = mate2_sex, + mate_dad = mate2_dad, mate_mom = mate2_mom, prefer_unmated = FALSE + ) + } + } else { + pool <- makePool( + ped = ped, mate_id = ID_mate2, mate_sex = mate2_sex, + mate_dad = mate2_dad, mate_mom = mate2_mom, prefer_unmated = FALSE + ) + } + if (length(pool) > 0) { + ID_mate1 <- resample(pool, 1) + if (verbose) message("Auto-selected ID_mate1 = ", ID_mate1) + } + } # check if the two IDs are provided + } else if (is.na(ID_mate1) && is.na(ID_mate2)) { # Check if the generation is provided if (is.na(gen_inbred)) { stop("You should provide either the IDs of the inbred mates or the generation of the inbred mates") @@ -195,15 +352,34 @@ makeInbreeding <- function(ped, if (!is.na(ID_mate2)) { break } - ID_pool_mate1 <- ped$ID[ped$gen == gen_inbred & !is.na(ped$dadID) & !is.na(ped$momID) & is.na(ped$spID) & !(ped$ID %in% usedID)] - # if the pool is empty, find all individuals who have the same dadID and momID as the selected individual but mated - if (length(ID_pool_mate1) == 0) { + + if (prefer_unmated) { + # try to find one opposite + ID_pool_mate1 <- ped$ID[ped$gen == gen_inbred & !is.na(ped$dadID) & !is.na(ped$momID) & is.na(ped$spID) & !(ped$ID %in% usedID)] + + + # if the pool is empty, find all individuals who have the same dadID and momID as the selected individual but mated + if (length(ID_pool_mate1) == 0) { + ID_pool_mate1 <- ped$ID[ped$gen == gen_inbred & !is.na(ped$dadID) & !is.na(ped$momID) & !(ped$ID %in% usedID)] + } + } else { ID_pool_mate1 <- ped$ID[ped$gen == gen_inbred & !is.na(ped$dadID) & !is.na(ped$momID) & !(ped$ID %in% usedID)] } + ID_mate1 <- resample(ID_pool_mate1, 1) usedID <- c(usedID, ID_mate1) # try to find one opposite-sex individual who has the same dadID and momID as the selected individual, preferalbly not mated - ID_pool_mate2 <- ped$ID[ped$gen == gen_inbred & ped$sex != ped$sex[ped$ID == ID_mate1] & ped$dadID == ped$dadID[ped$ID == ID_mate1] & ped$momID == ped$momID[ped$ID == ID_mate1] & is.na(ped$spID)] + if (prefer_unmated) { + ID_pool_mate2 <- ped$ID[ped$gen == gen_inbred & ped$sex != ped$sex[ped$ID == ID_mate1] & ped$dadID == ped$dadID[ped$ID == ID_mate1] & ped$momID == ped$momID[ped$ID == ID_mate1] & is.na(ped$spID)] + # back up + if (length(ID_pool_mate1) == 0) { + ID_pool_mate2 <- ped$ID[ped$gen == gen_inbred & ped$sex != ped$sex[ped$ID == ID_mate1] & ped$dadID == ped$dadID[ped$ID == ID_mate1] & ped$momID == ped$momID[ped$ID == ID_mate1]] + } + } else { + # ID_pool_mate2 <- ped$ID[ped$gen == gen_inbred & ped$sex != ped$sex[ped$ID == ID_mate1] & ped$dadID == ped$dadID[ped$ID == ID_mate1] & ped$momID == ped$momID[ped$ID == ID_mate1]] + ID_pool_mate2 <- makePool(ped = ped, mate_id = ID_mate1, mate_sex = ped$sex[ped$ID == ID_mate1], mate_dad = ped$dadID[ped$ID == ID_mate1], mate_mom = ped$momID[ped$ID == ID_mate1], prefer_unmated = prefer_unmated, gen_inbred = gen_inbred) + } + # if the pool is not empty, randomly select one individual from the pool if (length(ID_pool_mate2) > 0) { ID_mate2 <- resample(ID_pool_mate2, 1) @@ -237,20 +413,18 @@ makeInbreeding <- function(ped, # change the spouseID of ID_mate1 and ID_mate2 to each other ped$spID[ped$ID == ID_mate1] <- ID_mate2 ped$spID[ped$ID == ID_mate2] <- ID_mate1 - # change the individuals in next generation whose dadID and momID are ID_mate1 and ID_mate2's former mates to ID_mate1 and ID_mate2 - for (j in seq_len(nrow(ped))) { - if (!is.na(ped$dadID[j]) & !is.na(ID_mate1_former_mate) & ped$dadID[j] == ID_mate1_former_mate) { - ped$dadID[j] <- ID_mate2 - } - if (!is.na(ped$momID[j]) & !is.na(ID_mate1_former_mate) & ped$momID[j] == ID_mate1_former_mate) { - ped$momID[j] <- ID_mate2 - } - if (!is.na(ped$dadID[j]) & !is.na(ID_mate2_former_mate) & ped$dadID[j] == ID_mate2_former_mate) { - ped$dadID[j] <- ID_mate1 - } - if (!is.na(ped$momID[j]) & !is.na(ID_mate2_former_mate) & ped$momID[j] == ID_mate2_former_mate) { - ped$momID[j] <- ID_mate1 - } + # Vectorized replacement of former mates' IDs in children's parent columns + if (!is.na(ID_mate1_former_mate)) { + dad_match1 <- which(!is.na(ped$dadID) & ped$dadID == ID_mate1_former_mate) + if (length(dad_match1) > 0) ped$dadID[dad_match1] <- ID_mate2 + mom_match1 <- which(!is.na(ped$momID) & ped$momID == ID_mate1_former_mate) + if (length(mom_match1) > 0) ped$momID[mom_match1] <- ID_mate2 + } + if (!is.na(ID_mate2_former_mate)) { + dad_match2 <- which(!is.na(ped$dadID) & ped$dadID == ID_mate2_former_mate) + if (length(dad_match2) > 0) ped$dadID[dad_match2] <- ID_mate1 + mom_match2 <- which(!is.na(ped$momID) & ped$momID == ID_mate2_former_mate) + if (length(mom_match2) > 0) ped$momID[mom_match2] <- ID_mate1 } return(ped) } @@ -265,13 +439,28 @@ makeInbreeding <- function(ped, #' @param gen_drop the generation in which the randomly dropped person is. Will work if `ID_drop` is not specified. #' @param sex_drop the biological sex of the randomly dropped person. #' @param n_drop the number of times the mutation happens. +#' @param verbose logical. If TRUE, print progress through stages of algorithm #' @return a pedigree with the dropped person's `dadID` and `momID` set to NA. #' @export dropLink <- function(ped, ID_drop = NA_integer_, gen_drop = 2, sex_drop = NA_character_, - n_drop = 1) { + n_drop = 1, + verbose = FALSE) { + # Standardize column names for consistency with other functions + + if (paste0(colnames(ped), + collapse = "" + ) != paste0( + c("famID", "ID", "gen", "dadID", "momID", "spID", "sex"), + collapse = "" + )) { + ped <- standardizeColnames(ped, verbose = verbose) + if (verbose == TRUE) { + message("The input pedigree is not in the same format as the output of simulatePedigree\n") + } + } # check if the ID_drop is specified if (is.na(ID_drop)) { # check if the sex_drop is specified @@ -406,3 +595,27 @@ addPersonToPed <- function(ped, name = NULL, return(rbind(ped, new_row)) } } + + +makePool <- function(ped, mate_id, mate_sex, mate_dad, mate_mom, prefer_unmated = FALSE, + gen_inbred = NULL, usedID = NULL) { + if (!is.null(gen_inbred)) { + ped <- ped[ped$gen == gen_inbred, ] + } + if (!is.null(usedID)) { + ped <- ped[!(ped$ID %in% usedID), ] + } + + + # should we prefer unmated siblings when automatically selecting inbred mates? If yes, we will only consider unmated siblings. If no, we will consider all siblings regardless of their mating status. + if (prefer_unmated == TRUE) { + mated_mask <- is.na(ped$spID) + } else { + mated_mask <- rep(TRUE, nrow(ped)) + } + + pool <- ped$ID[ped$ID != mate_id & ped$sex != mate_sex & + !is.na(ped$dadID) & ped$dadID == mate_dad & + !is.na(ped$momID) & ped$momID == mate_mom & mated_mask] + return(pool) +} diff --git a/data-raw/optimizing.R b/data-raw/optimizing.R deleted file mode 100644 index ab79fd03..00000000 --- a/data-raw/optimizing.R +++ /dev/null @@ -1,83 +0,0 @@ -library(profvis) -library(microbenchmark) -library(tidyverse) -set.seed(1667) -Ngen <- 3 -kpc <- 4 -sexR <- .50 # sometimes fails above .5 -marR <- .7 -reps <- 10 -if (FALSE) { - profvis({ - simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) - }) - - profvis({ - simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) - }) -} - - -benchmark_results <- microbenchmark( - beta_false_1gen = { - simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = FALSE) - }, - beta_true_1gen = { - simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = TRUE) - }, - beta_false_lowgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) - }, - beta_true_lowgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) - }, - beta_false_midgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = FALSE) - }, - beta_true_midgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = TRUE) - }, - beta_false_highgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = FALSE) - }, - beta_true_highgen = { - simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = TRUE) - }, - times = reps # Run each method 10 times -) - -benchmark_results <- benchmark_results %>% - mutate( - beta_factor = factor(case_when( - grepl("beta_true", expr) ~ "TRUE", - grepl("beta_false", expr) ~ "FALSE", - grepl("beta_indexed", expr) ~ "indexed" - )), - beta = ifelse(grepl("beta_false", expr), FALSE, TRUE), - gen_num = case_when( - grepl("1gen", expr) ~ 1, - grepl("lowgen", expr) ~ Ngen, - grepl("midgen", expr) ~ Ngen * 2, - grepl("highgen", expr) ~ Ngen * 3 - ), - gen_factor = factor(gen_num, levels = c(1, Ngen, Ngen * 2, Ngen * 3)) - ) - -summary(benchmark_results) -lm(benchmark_results$time ~ benchmark_results$beta * benchmark_results$gen_num) %>% - summary() - -lm(benchmark_results$time ~ benchmark_results$beta) %>% - summary() -# log transform time for better visualization - -ggplot(benchmark_results, aes(x = gen_factor, y = time / 1e6, color = beta_factor)) + - geom_boxplot() + - labs( - title = "Benchmarking simulatePedigree() with and without beta parameter", - x = "Generation Size", - y = "Execution Time (ms)", - color = "Beta Parameter" - ) + - theme_minimal() + - scale_y_log10() diff --git a/data-raw/optimizing_simulations.R b/data-raw/optimizing_simulations.R new file mode 100644 index 00000000..be1f7464 --- /dev/null +++ b/data-raw/optimizing_simulations.R @@ -0,0 +1,106 @@ +library(profvis) +library(microbenchmark) +library(tidyverse) +set.seed(1667) +Ngen <- 3 +kpc <- 4 +sexR <- .50 # sometimes fails above .5 +marR <- .7 +reps <- 10 + +if (FALSE) { + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }) + + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }) +} +if (FALSE) { + benchmark_results <- microbenchmark( + beta_false_1gen = { + simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_1gen = { + simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = TRUE) + }, + times = reps # Run each method 10 times + ) + + benchmark_results <- benchmark_results %>% + mutate( + beta_factor = factor(case_when( + grepl("beta_true", expr) ~ "TRUE", + grepl("beta_false", expr) ~ "FALSE", + grepl("beta_indexed", expr) ~ "indexed" + )), + beta = ifelse(grepl("beta_false", expr), FALSE, TRUE), + gen_num = case_when( + grepl("1gen", expr) ~ 1, + grepl("lowgen", expr) ~ Ngen, + grepl("midgen", expr) ~ Ngen * 2, + grepl("highgen", expr) ~ Ngen * 3 + ), + gen_factor = factor(gen_num, levels = c(1, Ngen, Ngen * 2, Ngen * 3)) + ) + benchmark_results <- benchmark_results %>% + mutate( + beta_factor = factor(case_when( + grepl("beta_true", expr) ~ "TRUE", + grepl("beta_false", expr) ~ "FALSE", + grepl("beta_indexed", expr) ~ "indexed" + )), + beta = ifelse(grepl("beta_false", expr), FALSE, TRUE), + gen_num = case_when( + grepl("1gen", expr) ~ 1, + grepl("lowgen", expr) ~ Ngen, + grepl("midgen", expr) ~ Ngen * 2, + grepl("highgen", expr) ~ Ngen * 3 + ), + gen_factor = factor(gen_num, levels = c(1, Ngen, Ngen * 2, Ngen * 3)) + ) + + summary(benchmark_results) + lm(benchmark_results$time ~ benchmark_results$beta * benchmark_results$gen_num) %>% + summary() + summary(benchmark_results) + lm(benchmark_results$time ~ benchmark_results$beta * benchmark_results$gen_num) %>% + summary() + + lm(benchmark_results$time ~ benchmark_results$beta) %>% + summary() + # log transform time for better visualization + lm(benchmark_results$time ~ benchmark_results$beta) %>% + summary() + # log transform time for better visualization + + ggplot(benchmark_results, aes(x = gen_factor, y = time / 1e6, color = beta_factor)) + + geom_boxplot() + + labs( + title = "Benchmarking simulatePedigree() with and without beta parameter", + x = "Generation Size", + y = "Execution Time (ms)", + color = "Beta Parameter" + ) + + theme_minimal() + + scale_y_log10() +} diff --git a/data-raw/optimizing_twins.R b/data-raw/optimizing_twins.R new file mode 100644 index 00000000..f16495ad --- /dev/null +++ b/data-raw/optimizing_twins.R @@ -0,0 +1,382 @@ +library(profvis) +library(microbenchmark) +library(tidyverse) +devtools::load_all(".") + + +# --------------------------- +# 0) Config +# --------------------------- +cfg <- list( + seed = 1164127, + Ngen_base = 3, + reps = 10, + all_scenarios = FALSE, # set to TRUE to run all scenarios defined below + include_highgen = TRUE, + include_1gen = FALSE, + include_lowgen = FALSE +) +cfg$gen_twin <- ceiling(cfg$Ngen_base - 1) + +set.seed(cfg$seed) + +# --------------------------- +# 1) Levels (edit here to extend) +# --------------------------- +levels <- list( + ped = tibble( + ped_label = c( + if (cfg$include_1gen) "1gen", + if (cfg$include_lowgen) "lowgen", + "midgen", + if (cfg$include_highgen) "highgen" + ), + Ngen_total = c( + if (cfg$include_1gen) 1, + if (cfg$include_lowgen) cfg$Ngen_base, + cfg$Ngen_base * 2, + if (cfg$include_highgen) cfg$Ngen_base * 3 + ), + gen_twin = c( + if (cfg$include_1gen) 1, + if (cfg$include_lowgen) cfg$gen_twin, + cfg$gen_twin, + if (cfg$include_highgen) cfg$gen_twin + ) + # Add highgen row whenever you want + ), + + # Simulation-side factors (simulatePedigree) + kpc = 3, # set to c(2, 3, 4) to vary + sexR = 0.50, # sometimes fails above .5 + marR = c(0.8), # set to c(0.6, 0.8, 0.9) to vary + sim_beta = TRUE, # set to c(TRUE, FALSE) if you ever want to vary + + # Conversion-side factors (ped2com) + component = c("additive"), + twin_method = c("NULL", "addtwins", "merging"), + beta = c(FALSE, TRUE), + sparse_matrix = c(FALSE, TRUE) # user-facing name, translated to ped2com's `sparse` +) + +# Which columns define a unique simulation vs conversion condition +# If you add a new factor, put its name in the right scope here. + +scopes <- list( + sim = c("ped_label", "Ngen_total", "gen_twin", "kpc", "sexR", "marR", "sim_beta"), + conv = c("component", "twin_method", "beta", "sparse_matrix") +) + +# --------------------------- +# 2) Scenarios (edit here to control crossing) +# --------------------------- +# Each scenario says what to vary and what to fix. No other code changes. +scenarios <- list( + full = list( + vary = c("ped", "marR", "twin_method", "beta", "sparse_matrix"), + fixed = list() + ), + quick = list( + vary = c("ped", "twin_method"), + fixed = list(marR = 0.8, beta = FALSE, sparse_matrix = TRUE, component = "additive") + ) + + # Add more scenarios whenever you want: + # e.g., "marR_only" = list(vary=c("marR"), fixed=list(ped = levels$ped[levels$ped$ped_label=="midgen",], ...)) +) + +if (!cfg$all_scenarios) { + scenarios <- scenarios[c("full")] # order control; also, comment out any you don't want to run +} +# --------------------------- +# 3) Generic design builder (do not edit to add factors) +# --------------------------- +`%||%` <- function(x, y) if (!is.null(x)) x else y + +level_tbl <- function(name, x) { + if (inherits(x, "data.frame")) { + return(x) + } + tibble(!!name := x) +} + +default_value <- function(x) { + if (inherits(x, "data.frame")) { + return(x[1, , drop = FALSE]) + } + x[[1]] +} + +expand_scenario <- function(levels, vary, fixed = list(), scenario = "scenario") { + df <- tibble(.dummy = 1) + + # expand varied factors + for (nm in vary) { + df <- tidyr::crossing(df, level_tbl(nm, levels[[nm]])) + } + + # add fixed/default factors not varied + not_varied <- setdiff(names(levels), vary) + for (nm in not_varied) { + lv <- levels[[nm]] + + if (inherits(lv, "data.frame")) { + const <- fixed[[nm]] %||% default_value(lv) + if (!inherits(const, "data.frame") || nrow(const) != 1) const <- default_value(lv) + df <- bind_cols(df, const[rep(1, nrow(df)), , drop = FALSE]) + } else { + df[[nm]] <- fixed[[nm]] %||% default_value(lv) + } + } + + df %>% + select(-.dummy) %>% + mutate(scenario = scenario, .before = 1) +} + +build_design <- function(levels, scenarios) { + purrr::imap_dfr( + scenarios, + ~ expand_scenario(levels, vary = .x$vary, fixed = .x$fixed, scenario = .y) + ) +} + +design <- build_design(levels, scenarios) + +# Create a stable, joinable label (also becomes the microbenchmark expr name) +design <- design %>% + mutate( + label = paste0( + scenario, + "|ped=", ped_label, + "|marR=", marR, + "|twin=", twin_method, + "|beta=", beta, + "|sparse=", sparse_matrix, + "|comp=", component + ) + ) + +# --------------------------- +# 4) Simulation cache (simulate once per unique sim condition) +# --------------------------- +simulate_one <- function(Ngen_total, gen_twin, kpc, sexR, marR, sim_beta) { + simulatePedigree( + kpc = kpc, Ngen = Ngen_total, sexR = sexR, marR = marR, + beta = sim_beta + ) %>% + makeTwins(gen_twin = gen_twin) +} + +sim_tbl <- design %>% + distinct(across(all_of(scopes$sim))) %>% + mutate( + sim_id = row_number(), + ped = pmap( + list(Ngen_total, gen_twin, kpc, sexR, marR, sim_beta, sim_id), + function(Ngen_total, gen_twin, kpc, sexR, marR, sim_beta, sim_id) { + set.seed(cfg$seed + sim_id) + simulate_one(Ngen_total, gen_twin, kpc, sexR, marR, sim_beta) + } + ) + ) %>% + select(-sim_id) + +design <- design %>% left_join(sim_tbl, by = scopes$sim) + +# Put peds and args in keyed lists so benchmark expressions stay tiny +peds_by_label <- setNames(design$ped, design$label) + +# --------------------------- +# 5) Conversion arg translation (edit only if you add non-1:1 args) +# --------------------------- +# Columns that map directly to ped2com arg names go through automatically. +# Anything else gets a translation rule here. +special_to_args <- list( + twin_method = function(v) { + if (is.null(v) || length(v) == 0 || is.na(v) || v == "NULL") { + list(mz_twins = FALSE) + } else { + list(mz_twins = TRUE, mz_method = v) + } + }, + beta = function(v) list(beta = TRUE), + sparse_matrix = function(v) list(sparse = v) +) + +make_conv_args <- function(row, conv_cols) { + # row is already a named list of scalar values when called correctly + direct_cols <- setdiff(conv_cols, names(special_to_args)) + args <- row[direct_cols] + + for (nm in intersect(names(special_to_args), conv_cols)) { + args <- c(args, special_to_args[[nm]](row[[nm]])) + } + + args +} + +make_conv_args_row <- function(...) { + row <- list(...) + make_conv_args(row, scopes$conv) +} + +# Correct per-row args creation +args_by_label <- design %>% + mutate( + conv_args = pmap(select(., all_of(scopes$conv)), make_conv_args_row) + ) %>% + select(label, conv_args) %>% + deframe() + +# --------------------------- +# 6) One microbenchmark call with all expressions (correct behavior) +# --------------------------- +bench_exprs <- lapply(names(peds_by_label), function(lbl) { + bquote( + do.call(ped2com, c(list(ped = peds_by_label[[.(lbl)]]), args_by_label[[.(lbl)]])) + ) +}) +names(bench_exprs) <- names(peds_by_label) + +write_csv(design, "data-raw/ped2com_benchmark_design.csv") +# write start time +write.table(Sys.time(), "data-raw/ped2com_benchmark_start_time.txt", row.names = FALSE, col.names = FALSE) + + +benchmark_results <- do.call( + microbenchmark::microbenchmark, + c(bench_exprs, list(times = cfg$reps)) +) + +write.table(Sys.time(), "data-raw/ped2com_benchmark_end_time.txt", row.names = FALSE, col.names = FALSE) + +results <- as_tibble(benchmark_results) %>% + mutate(label = as.character(expr)) %>% + left_join( + design %>% select(-ped), + by = "label" + ) +# notes: sparse with addtwins is slow (878 vs 250), but sparse with merging is way times slower (14257 vs 241). This is a huge difference, and suggests that the merging method is not compatible with sparse matrices in its current form. The addtwins method is slower than NULL in both cases, but the difference is much more pronounced when using sparse matrices. The means of adding or substituting twins in the pedigree may interact with the way sparse matrices are constructed or handled in ped2com, leading to increased computational overhead. The NULL method is the fastest in both cases, which is expected since it does not involve any additional processing for twins. Overall, these results suggest that while the addtwins method can be used with sparse matrices, it may not be the most efficient choice, and the merging method may not be suitable for use with sparse matrices at all. +# --------------------------- +# 7) Analysis/plot +# --------------------------- +results <- results %>% + mutate( + twin_method = factor(twin_method, levels = c("NULL", "addtwins", "merging")), + ped_label = factor(ped_label, levels = levels$ped$ped_label), + gen_factor = factor(ped_label, levels = levels$ped$ped_label, labels = paste0(levels$ped$Ngen_total, " gen")) + ) +write_csv(results, "data-raw/ped2com_benchmark_results.csv") + +summary(results) +results %>% + group_by(ped_label, twin_method, sparse_matrix) %>% + summarise( + median_time_ms = median(time / 1e6), + mean_time_ms = mean(time / 1e6), + var_time_ms = var(time / 1e6), + sd_time_ms = sd(time / 1e6), + se_time_ms = sd(time / 1e6) / sqrt(n()), + n_time = n(), + .groups = "drop_last" + ) %>% + arrange(ped_label, twin_method, sparse_matrix) %>% + write_csv("data-raw/ped2com_benchmark_summary.csv") + +# ped_label twin_method sparse_matrix median_time_ms +# +# 7 highgen NULL FALSE 222. +# 8 highgen NULL TRUE 198. +# 9 highgen addtwins FALSE 250. +# 10 highgen addtwins TRUE 878. +# 11 highgen merging FALSE 241. +# 12 highgen merging TRUE 14257. + +if (cfg$reps > 8) { + notch <- FALSE +} else { + notch <- FALSE +} + +results %>% + # dplyr::filter(!ped_label %in% c("1gen", "lowgen", "midgen")) %>% + mutate( + beta_sparse = paste0("beta=", beta, ", sparse=", sparse_matrix), + beta_sparse = factor(beta_sparse, levels = c( + "beta=FALSE, sparse=FALSE", + "beta=FALSE, sparse=TRUE", + "beta=TRUE, sparse=FALSE", + "beta=TRUE, sparse=TRUE" + )) + ) %>% + ggplot( + aes( + x = ped_label, + y = time / 1e6, + fill = twin_method, + color = sparse_matrix # , + # shape = beta + ) + ) + + geom_boxplot( + notch = notch, position = position_dodge(width = 0.8), outlier.size = 0.5, + linewidth = 0.25 + ) + + scale_y_log10() + + facet_grid(~scenario) + + labs( + title = "Benchmarking ped2com() by twin handling and beta option", + x = "Pedigree", y = "Execution time (ms)", color = "sparse_matrix", fill = "Twin method" + ) + + theme_minimal() #+ +# scale_fill_manual(values = c( +# "beta=FALSE, sparse=FALSE" = "lightgray", +# "beta=FALSE, sparse=TRUE" = "gray8", +# "beta=TRUE, sparse=FALSE" = "lightcoral", +# "beta=TRUE, sparse=TRUE" = "red2" +# )) + +# scale_color_manual(values = c("NULL" = "gray", "addtwins" = "skyblue3", "merging" = "tomato2")) +# df <- sim_tbl$ped[[2]] + +# df_add <- df %>% ped2add() + +if (FALSE) { + profvis( + { + sim_tbl$ped[[1]] %>% ped2com(component = "additive", mz_twins = TRUE, mz_method = "addtwins", beta = TRUE, sparse = TRUE) + }, + interval = 0.01 + ) + + + profvis( + { + sim_tbl$ped[[1]] %>% ped2com(component = "additive", mz_twins = TRUE, mz_method = "merging", beta = TRUE, sparse = FALSE) + }, + interval = 0.01 + ) + # could it be the interaction between NA and gc?? + test_df_twins <- sim_tbl$ped[[1]] %>% fuseTwins(test_df_twins = TRUE) + foundreturnedall <- findMZtwins(sim_tbl$ped[[1]], + verbose = F, + returnRows = TRUE, returnIDs = TRUE, returnAsList = T + ) + foundreturnedrows <- findMZtwins(sim_tbl$ped[[1]], + verbose = F, + returnRows = TRUE, returnIDs = FALSE, returnAsList = T + ) + foundreturnedids <- findMZtwins(sim_tbl$ped[[1]], + verbose = F, + returnRows = FALSE, returnIDs = TRUE, returnAsList = T + ) +} +if (FALSE) { + # Example of how to run a single benchmark for one condition + profvis( + { + sim_tbl$ped[[1]] %>% fuseTwins(mz_id_pairs = NULL, mz_row_pairs = NULL) + }, + interval = 0.01 + ) +} diff --git a/data-raw/ped2com_benchmark_design.csv b/data-raw/ped2com_benchmark_design.csv new file mode 100644 index 00000000..593138ce --- /dev/null +++ b/data-raw/ped2com_benchmark_design.csv @@ -0,0 +1,25 @@ +scenario,ped_label,Ngen_total,gen_twin,marR,twin_method,beta,sparse_matrix,kpc,sexR,sim_beta,component,label,ped +full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive, +full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive, +full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive, +full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive, +full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive, +full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive, +full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive, +full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive, +full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive, diff --git a/data-raw/ped2com_benchmark_end_time.txt b/data-raw/ped2com_benchmark_end_time.txt new file mode 100644 index 00000000..ae7f71ae --- /dev/null +++ b/data-raw/ped2com_benchmark_end_time.txt @@ -0,0 +1 @@ +2026-02-16 10:47:50.923385 diff --git a/data-raw/ped2com_benchmark_results.csv b/data-raw/ped2com_benchmark_results.csv new file mode 100644 index 00000000..c26e7302 --- /dev/null +++ b/data-raw/ped2com_benchmark_results.csv @@ -0,0 +1,241 @@ +expr,time,label,scenario,ped_label,Ngen_total,gen_twin,marR,twin_method,beta,sparse_matrix,kpc,sexR,sim_beta,component,gen_factor +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,1097588100,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1427187500,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,208481200,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6189200,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,200689000,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,27103200,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,40300200,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,6026000,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,227563000,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,7928900,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,205996300,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7277200,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7109900,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,238252400,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,8026300,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,286128500,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1445191500,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1254251600,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,6127200,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,10656600,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,9625200,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,8027400,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,11509500,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,7426000,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,443970200,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,241512000,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,452599400,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,11105000,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,7619100,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,244079300,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,902782300,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,454902600,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,234666900,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7181500,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8527000,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,212780500,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,1100157100,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,231629800,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,245438700,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,9059000,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,214617000,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,6712400,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,445690600,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,235071900,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,253957100,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1028123600,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,10129800,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,232238800,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,399899300,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,266978400,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,12141300,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,7296500,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,262951300,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8368100,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,8787900,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,13507800,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7670000,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,261880300,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,420694500,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,6956400,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,10659100,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6674800,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,415774000,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7061000,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,8168300,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,12491500,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,859762100,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,10746300,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7435100,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,10309000,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1154719100,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,686766200,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8156100,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1250916100,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,720033600,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,227784000,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,210911700,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,8514700,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,209204900,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,456020500,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,14974900,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,13465800,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,194586400,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,226957000,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,182487000,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,8091400,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,666622800,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7738000,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7558800,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,506063000,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,428734800,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8621300,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,11728100,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,12375600,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,8772300,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,12003600,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,824795700,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7323800,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,245685100,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,417243300,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,9535600,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7172700,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7072900,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8495100,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,238015500,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7356900,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,515113200,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,703907500,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7065900,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7029700,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,223874400,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,234624100,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,10971500,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,9505400,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8480900,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,701840500,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,8603400,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1366937400,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,12457400,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7429200,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,221505800,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,8338300,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7587900,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,260293200,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6970500,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,387071800,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7560700,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7024200,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,7572400,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,679198200,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,7570400,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,8952100,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,220476700,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1324076900,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,870241700,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,236028200,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,241456900,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,6418900,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,193984200,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,174968700,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,9338700,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,237393900,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,8107300,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7594300,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,214633800,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,223070000,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1162039400,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,6900500,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,9390800,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,6822900,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6129800,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,419321700,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,239979200,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,246306700,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,413186000,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,945828200,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,212045300,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,516998700,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,7050200,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,185658200,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,195559600,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,10013400,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,244554400,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,10914100,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,260740500,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,696082600,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,991407800,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,183159200,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,250327800,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,11682700,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,972089100,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7223900,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,7554000,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,248708600,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,414717900,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,234125500,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,185366000,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1164331700,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,192493700,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,1004205400,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,409725900,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,9967800,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7054100,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7931100,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,9482300,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,240506300,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,178059100,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,9036700,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,8792900,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,515438000,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7878200,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,6172500,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,678646600,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,12139500,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,705609800,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,10250500,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,8626400,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,669870100,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,8373900,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7465900,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,7766100,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,8029300,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,9360000,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,8011500,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,5451100,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,181073500,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,708658700,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,458115900,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,212310700,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6463400,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,1134949700,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,10318000,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,6561300,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,415959500,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,10837000,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,253422200,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,11325500,full|ped=midgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,498453800,full|ped=highgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,6816400,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,9532900,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,7317400,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,815704200,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,8478700,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7668400,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,189231300,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,970275100,full|ped=highgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,FALSE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,224921500,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,823923000,full|ped=highgen|marR=0.8|twin=addtwins|beta=TRUE|sparse=TRUE|comp=additive,full,highgen,9,2,0.8,addtwins,TRUE,TRUE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,8553200,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,203815400,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,8441500,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,239606300,full|ped=highgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,434145700,full|ped=highgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,7041200,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,7063600,full|ped=midgen|marR=0.8|twin=merging|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,206283600,full|ped=highgen|marR=0.8|twin=NULL|beta=TRUE|sparse=FALSE|comp=additive,full,highgen,9,2,0.8,NULL,TRUE,FALSE,3,0.5,TRUE,additive,9 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,6966300,full|ped=midgen|marR=0.8|twin=NULL|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,NULL,FALSE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,6370100,full|ped=midgen|marR=0.8|twin=NULL|beta=TRUE|sparse=TRUE|comp=additive,full,midgen,6,2,0.8,NULL,TRUE,TRUE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,6395900,full|ped=midgen|marR=0.8|twin=merging|beta=TRUE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,merging,TRUE,FALSE,3,0.5,TRUE,additive,6 gen +full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,7256300,full|ped=midgen|marR=0.8|twin=addtwins|beta=FALSE|sparse=FALSE|comp=additive,full,midgen,6,2,0.8,addtwins,FALSE,FALSE,3,0.5,TRUE,additive,6 gen diff --git a/data-raw/ped2com_benchmark_start_time.txt b/data-raw/ped2com_benchmark_start_time.txt new file mode 100644 index 00000000..91c3edb0 --- /dev/null +++ b/data-raw/ped2com_benchmark_start_time.txt @@ -0,0 +1 @@ +2026-02-16 10:46:51.927423 diff --git a/data-raw/ped2com_benchmark_summary.csv b/data-raw/ped2com_benchmark_summary.csv new file mode 100644 index 00000000..5fd17eb4 --- /dev/null +++ b/data-raw/ped2com_benchmark_summary.csv @@ -0,0 +1,13 @@ +ped_label,twin_method,sparse_matrix,median_time_ms,mean_time_ms,var_time_ms,sd_time_ms,se_time_ms,n_time +midgen,NULL,FALSE,7.08545,7.065175,0.38096159250000006,0.6172208620097024,0.13801478045847118,20 +midgen,NULL,TRUE,7.010350000000001,6.97651,0.3269079230526316,0.5717586230680143,0.1278491147901759,20 +midgen,addtwins,FALSE,8.371,8.478245,1.852214922605263,1.360961029054566,0.30432013756940757,20 +midgen,addtwins,TRUE,11.5961,12.25093,13.742667178000001,3.7071103541707524,0.8289350752019123,20 +midgen,merging,FALSE,7.703200000000001,7.761935,0.6603589255526314,0.81262471384559265,0.18170841003550597,20 +midgen,merging,TRUE,9.5205,11.184985,48.470337688710515,6.962064183035841,1.556764877698468,20 +highgen,NULL,FALSE,225.3165,243.54288,3892.258395726947,62.38796675423031,13.950373464045589,20 +highgen,NULL,TRUE,194.2853,214.36891,3924.8415531188416,62.648555874168736,14.008642962683505,20 +highgen,addtwins,FALSE,261.08675,315.74309,9358.837543218842,96.74108508394374,21.631964246478915,20 +highgen,addtwins,TRUE,1081.53665,1098.186765,40060.529971101336,200.15126772294332,44.75518404112608,20 +highgen,merging,FALSE,253.68965,307.289225,7715.235275696711,87.83641201515867,19.640818816557406,20 +highgen,merging,TRUE,691.4244,704.852615,29254.359534471863,171.03905850557018,38.24549616260185,20 diff --git a/data-raw/twin_methods_sparse.png b/data-raw/twin_methods_sparse.png new file mode 100644 index 00000000..d69e5a0a Binary files /dev/null and b/data-raw/twin_methods_sparse.png differ diff --git a/man/adjustKidsPerCouple.Rd b/man/adjustKidsPerCouple.Rd index bf5b8192..dbe7e500 100644 --- a/man/adjustKidsPerCouple.Rd +++ b/man/adjustKidsPerCouple.Rd @@ -21,7 +21,18 @@ value is 3. Returns an error when kpc equals 1.} generated from a poisson distribution with mean kpc. If FALSE, the number of kids per mate will be fixed at kpc.} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} } \value{ A numeric vector with the generated or adjusted number of kids per couple. diff --git a/man/buildBetweenGenerations.Rd b/man/buildBetweenGenerations.Rd index fa87604e..f28622b2 100644 --- a/man/buildBetweenGenerations.Rd +++ b/man/buildBetweenGenerations.Rd @@ -62,7 +62,18 @@ kids per mate will be fixed at kpc.} \item{code_female}{The value to use for females. Default is "F"} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} } \value{ The function updates the `df_Fam` data frame in place, adding or modifying columns related to parental and offspring status, diff --git a/man/buildFamilyGroups.Rd b/man/buildFamilyGroups.Rd new file mode 100644 index 00000000..731914ea --- /dev/null +++ b/man/buildFamilyGroups.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildmxPedigrees.R +\name{buildFamilyGroups} +\alias{buildFamilyGroups} +\title{Build family group models} +\usage{ +buildFamilyGroups( + dat, + ytemp, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL, + prefix = "fam" +) +} +\arguments{ +\item{dat}{A data frame where each row represents a family group and columns correspond to observed variables.} + +\item{ytemp}{A vector of variable names corresponding to the observed data.} + +\item{Addmat}{Additive genetic relatedness matrix.} + +\item{Nucmat}{Nuclear family shared environment relatedness matrix.} + +\item{Extmat}{Extended family shared environment relatedness matrix.} + +\item{Mtdmat}{Mitochondrial genetic relatedness matrix.} + +\item{Amimat}{Additive by mitochondrial interaction relatedness matrix.} + +\item{Dmgmat}{Dominance genetic relatedness matrix.} + +\item{prefix}{A prefix for naming the family groups. Default is "fam".} +} +\value{ +A list of OpenMx models for each family group. +} +\description{ +This function constructs OpenMx models for multiple family groups based on +provided relatedness matrices and observed data. +} diff --git a/man/buildOneFamilyGroup.Rd b/man/buildOneFamilyGroup.Rd new file mode 100644 index 00000000..eaf5da63 --- /dev/null +++ b/man/buildOneFamilyGroup.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildmxPedigrees.R +\name{buildOneFamilyGroup} +\alias{buildOneFamilyGroup} +\title{Build one family group model} +\usage{ +buildOneFamilyGroup( + group_name, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL, + full_df_row, + ytemp +) +} +\arguments{ +\item{group_name}{Name of the family group.} + +\item{Addmat}{Additive genetic relatedness matrix (from \code{\link{ped2add}}).} + +\item{Nucmat}{Nuclear family shared environment relatedness matrix (from \code{\link{ped2cn}}).} + +\item{Extmat}{Extended family shared environment indicator. When non-NULL, +a common-extended-environment term using a unit matrix is included.} + +\item{Mtdmat}{Mitochondrial genetic relatedness matrix (from \code{\link{ped2mit}}).} + +\item{Amimat}{Additive by mitochondrial interaction relatedness matrix.} + +\item{Dmgmat}{Dominance genetic relatedness matrix.} + +\item{full_df_row}{A 1-row matrix of observed data with column names matching \code{ytemp}.} + +\item{ytemp}{A character vector of variable names corresponding to the observed data columns.} +} +\value{ +An OpenMx model for the specified family group. +} +\description{ +This function constructs an OpenMx model for a single family group based on +provided relatedness matrices and observed data. The implied covariance +is built as a weighted sum of the supplied relatedness matrices, where +the weights are variance component parameters shared across groups via +a parent \code{ModelOne} sub-model. +} diff --git a/man/buildPedigreeModelCovariance.Rd b/man/buildPedigreeModelCovariance.Rd new file mode 100644 index 00000000..a454b666 --- /dev/null +++ b/man/buildPedigreeModelCovariance.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildmxPedigrees.R +\name{buildPedigreeModelCovariance} +\alias{buildPedigreeModelCovariance} +\title{Create an mxModel for a pedigree} +\usage{ +buildPedigreeModelCovariance( + vars = list(ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, mt2 = 0.1, am2 = 0.25, ee2 = + 0.6), + Vad = TRUE, + Vdd = FALSE, + Vcn = TRUE, + Vce = TRUE, + Vmt = TRUE, + Vam = FALSE, + Ver = TRUE +) +} +\arguments{ +\item{vars}{A named list or vector of initial variance component values. Names should include +ad2 (additive), dd2 (dominance), cn2 (common nuclear), ce2 (common extended), +mt2 (mitochondrial), am2 (additive-mitochondrial interaction), and ee2 (unique environment). +Default values are provided.} + +\item{Vad}{Logical. Include additive genetic variance component. Default is TRUE.} + +\item{Vdd}{Logical. Include dominance genetic variance component. Default is FALSE.} + +\item{Vcn}{Logical. Include common nuclear family environment variance component. Default is TRUE.} + +\item{Vce}{Logical. Include common extended family environment variance component. Default is TRUE.} + +\item{Vmt}{Logical. Include mitochondrial genetic variance component. Default is TRUE.} + +\item{Vam}{Logical. Include additive by mitochondrial interaction variance component. Default is FALSE.} + +\item{Ver}{Logical. Include unique environmental variance component. Default is TRUE.} +} +\value{ +An OpenMx model representing the pedigree with specified variance components. +} +\description{ +This function builds an OpenMx model for a pedigree with specified variance components. It requires the OpenMx package. +} diff --git a/man/buildPedigreeMx.Rd b/man/buildPedigreeMx.Rd new file mode 100644 index 00000000..24c2d95b --- /dev/null +++ b/man/buildPedigreeMx.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildmxPedigrees.R +\name{buildPedigreeMx} +\alias{buildPedigreeMx} +\title{Build Pedigree mxModel} +\usage{ +buildPedigreeMx(model_name, vars, group_models) +} +\arguments{ +\item{model_name}{Name of the overall pedigree model.} + +\item{vars}{A named list or vector of initial variance component values.} + +\item{group_models}{A list of OpenMx models for each family group.} +} +\value{ +An OpenMx pedigree model combining variance components and family groups. +} +\description{ +This function constructs an OpenMx pedigree model by combining variance +component parameters and family group models. It auto-detects which +variance components are referenced in the group algebras and creates +only those parameters. +} diff --git a/man/buildWithinGenerations.Rd b/man/buildWithinGenerations.Rd index 905e42ad..cbddf27a 100644 --- a/man/buildWithinGenerations.Rd +++ b/man/buildWithinGenerations.Rd @@ -20,7 +20,18 @@ buildWithinGenerations( ) } \arguments{ -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} \item{sizeGens}{A numeric vector containing the sizes of each generation within the pedigree.} diff --git a/man/dot-adjBeta.Rd b/man/dot-adjBeta.Rd index 5b962868..3c572806 100644 --- a/man/dot-adjBeta.Rd +++ b/man/dot-adjBeta.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/constructAdjacency.R \name{.adjBeta} \alias{.adjBeta} -\title{Construct Adjacency Matrix for Parent-Child Relationships Using Beta Method +\title{Construct Adjacency Matrix for Parent-Child Relationships Using Beta Methods This function constructs an adjacency matrix for parent-child relationships using a method in beta testing. It identifies parent-child pairs based on the specified component of relatedness.} @@ -60,7 +60,7 @@ specified component of relatedness.} \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ -Construct Adjacency Matrix for Parent-Child Relationships Using Beta Method +Construct Adjacency Matrix for Parent-Child Relationships Using Beta Methods This function constructs an adjacency matrix for parent-child relationships using a method in beta testing. It identifies parent-child pairs based on the specified component of relatedness. diff --git a/man/dropLink.Rd b/man/dropLink.Rd index 668d9e82..2753587d 100644 --- a/man/dropLink.Rd +++ b/man/dropLink.Rd @@ -9,7 +9,8 @@ dropLink( ID_drop = NA_integer_, gen_drop = 2, sex_drop = NA_character_, - n_drop = 1 + n_drop = 1, + verbose = FALSE ) } \arguments{ @@ -22,6 +23,8 @@ dropLink( \item{sex_drop}{the biological sex of the randomly dropped person.} \item{n_drop}{the number of times the mutation happens.} + +\item{verbose}{logical. If TRUE, print progress through stages of algorithm} } \value{ a pedigree with the dropped person's `dadID` and `momID` set to NA. diff --git a/man/findMZtwins.Rd b/man/findMZtwins.Rd new file mode 100644 index 00000000..39a361fd --- /dev/null +++ b/man/findMZtwins.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpTwins.R +\name{findMZtwins} +\alias{findMZtwins} +\title{Find MZ twin pair_rows in a pedigree} +\usage{ +findMZtwins( + ped, + verbose = FALSE, + returnRows = TRUE, + returnIDs = FALSE, + returnAsList = TRUE, + beta = FALSE +) +} +\arguments{ +\item{ped}{A pedigree data.frame with columns \code{ID} and \code{twinID}. +Optionally a \code{zygosity} column; when present only pair_rows where both +members have \code{zygosity == "MZ"} are used.} + +\item{verbose}{logical. If TRUE, print progress messages.} + +\item{returnRows}{logical. If TRUE, return the row indices of the twin pair_rows instead of IDs.} + +\item{returnIDs}{logical. If TRUE, return the IDs of the twin pair_rows instead of row indices.} + +\item{returnAsList}{logical. If TRUE, return results as a list of vectors +(default). If FALSE, return results as a data.frame with separate columns for each twin's ID and row index. +@param beta logical. If TRUE, use an optimized approach with O(1) lookups for large pedigrees. If FALSE (default), use a simpler approach that may be less efficient for large pedigrees.} +} +\value{ +A list of length-2 integer vectors \code{c(idx1, idx2)} giving the + row indices of each MZ pair in the pedigree, or \code{NULL} if none found. +} +\description{ +Identifies MZ twin pair_rows from the \code{twinID} column and returns their +row indices. These indices are used later to merge the twins' columns in +the \code{r2} matrix before \code{tcrossprod}, which correctly produces +relatedness 1 between MZ co-twins with no diagonal or downstream artifacts. +} +\keyword{internal} diff --git a/man/fitPedigreeModel.Rd b/man/fitPedigreeModel.Rd new file mode 100644 index 00000000..e8b93f02 --- /dev/null +++ b/man/fitPedigreeModel.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildmxPedigrees.R +\name{fitPedigreeModel} +\alias{fitPedigreeModel} +\title{Fit an OpenMx pedigree model to observed data} +\usage{ +fitPedigreeModel( + model_name = "PedigreeModel", + vars = list(ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, mt2 = 0.1, am2 = 0.25, ee2 = + 0.6), + data = NULL, + group_models = NULL, + tryhard = TRUE, + Addmat = NULL, + Nucmat = NULL, + Extmat = NULL, + Mtdmat = NULL, + Amimat = NULL, + Dmgmat = NULL +) +} +\arguments{ +\item{model_name}{Character. Name for the overall OpenMx model. Default is "PedigreeModel".} + +\item{vars}{A named list or vector of initial variance component values.} + +\item{data}{A matrix or data frame of observed data, where each row is a family +and columns correspond to individuals. Only used when \code{group_models} is NULL.} + +\item{group_models}{Optional list of pre-built OpenMx family group models +(from \code{\link{buildOneFamilyGroup}}). If NULL, they are generated from \code{data} +using the provided relatedness matrices.} + +\item{tryhard}{Logical. If TRUE (default), use \code{mxTryHard} for robust optimization; +if FALSE, use \code{mxRun}.} + +\item{Addmat}{Additive genetic relatedness matrix. Required when \code{group_models} is NULL.} + +\item{Nucmat}{Common nuclear environment relatedness matrix. Optional.} + +\item{Extmat}{Common extended environment relatedness matrix. Optional.} + +\item{Mtdmat}{Mitochondrial relatedness matrix. Optional.} + +\item{Amimat}{Additive-by-mitochondrial interaction matrix. Optional.} + +\item{Dmgmat}{Dominance genetic relatedness matrix. Optional.} +} +\value{ +A fitted OpenMx model. +} +\description{ +This function constructs and fits an OpenMx model for a pedigree using +specified variance components and family group models. +} diff --git a/man/fuseTwins.Rd b/man/fuseTwins.Rd new file mode 100644 index 00000000..42eb49fa --- /dev/null +++ b/man/fuseTwins.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpTwins.R +\name{fuseTwins} +\alias{fuseTwins} +\title{Fuse MZ twin pairs in a pedigree dataset for path tracing +This function identifies MZ twin pairs in the pedigree dataset and merges their IDs for path tracing purposes. The second twin in each pair is made a founder (with NA parents), and all children of the second twin are redirected to the first twin. This allows for correct relatedness calculations without diagonal or downstream artifacts.} +\usage{ +fuseTwins( + ped, + df_twins = NULL, + mz_id_pairs = NULL, + mz_row_pairs = NULL, + config = list(verbose = FALSE), + test_df_twins = FALSE, + beta = FALSE +) +} +\arguments{ +\item{ped}{A pedigree data.frame with columns \code{ID}, \code{momID}, \code{dadID}, and optionally \code{twinID} and \code{zygosity}. The function will look for MZ twin pairs based on the \code{twinID} column and optionally restrict to MZ pairs if a \code{zygosity} column is present.} + +\item{df_twins}{Optional data frame with columns \code{twin1_id}, \code{twin2_id}, \code{twin1_row}, and \code{twin2_row} specifying the IDs and row indices of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. If this parameter is provided, it takes precedence over \code{mz_id_pairs} and \code{mz_row_pairs}. If \code{test_df_twins} is TRUE, this data frame will be returned for testing purposes instead of performing the fusion.} + +\item{mz_id_pairs}{Optional list of length-2 character vectors specifying the IDs of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. Each element should be a character vector of length 2, e.g. \code{list(c("ID1", "ID2"), c("ID3", "ID4"))}.} + +\item{mz_row_pairs}{Optional list of length-2 integer vectors specifying the row indices of MZ twin pairs to fuse. If provided, this will be used instead of automatically identifying MZ twins from the \code{twinID} column. Each element should be an integer vector of length 2, e.g. \code{list(c(1, 2), c(3, 4))}.} + +\item{config}{A list of configuration options.} + +\item{test_df_twins}{logical. If TRUE, return the data frame of twin pairs instead of the modified pedigree. Default is FALSE.} + +\item{beta}{logical. If TRUE, use an optimized approach with O(1) lookups for large pedigrees when identifying MZ twins. Default is FALSE.} +} +\value{ +A modified version of the input pedigree data.frame with MZ twin pairs fused for path tracing. If \code{test_df_twins} is TRUE, returns the data frame of identified twin pairs instead. +} +\description{ +Fuse MZ twin pairs in a pedigree dataset for path tracing +This function identifies MZ twin pairs in the pedigree dataset and merges their IDs for path tracing purposes. The second twin in each pair is made a founder (with NA parents), and all children of the second twin are redirected to the first twin. This allows for correct relatedness calculations without diagonal or downstream artifacts. +} diff --git a/man/isTwin.Rd b/man/isTwin.Rd new file mode 100644 index 00000000..b8b2689b --- /dev/null +++ b/man/isTwin.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpTwins.R +\name{isTwin} +\alias{isTwin} +\title{Determine isTwin Status} +\usage{ +isTwin(ped) +} +\arguments{ +\item{ped}{pedigree data frame} +} +\value{ +A logical vector indicating, for each row of \code{ped}, whether + \code{twinID} is non-\code{NA}. +} +\description{ +Determine isTwin Status +} +\keyword{internal} diff --git a/man/makeInbreeding.Rd b/man/makeInbreeding.Rd index 1540b170..d43200c5 100644 --- a/man/makeInbreeding.Rd +++ b/man/makeInbreeding.Rd @@ -10,7 +10,8 @@ makeInbreeding( ID_mate2 = NA_integer_, verbose = FALSE, gen_inbred = 2, - type_inbred = "sib" + type_inbred = "sib", + prefer_unmated = FALSE ) } \arguments{ @@ -25,6 +26,8 @@ makeInbreeding( \item{gen_inbred}{A vector of \code{generation} of the twin to be imputed.} \item{type_inbred}{A character vector indicating the type of inbreeding. "sib" for sibling inbreeding and "cousin" for cousin inbreeding.} + +\item{prefer_unmated}{A logical indicating whether to prefer unmated siblings when automatically selecting inbred mates. Default is FALSE, which means the function will consider all siblings regardless of their mating status.} } \value{ Returns a \code{data.frame} with some inbred mates. diff --git a/man/makeTwins.Rd b/man/makeTwins.Rd index cb9733e5..dc4f3ee5 100644 --- a/man/makeTwins.Rd +++ b/man/makeTwins.Rd @@ -10,7 +10,8 @@ makeTwins( ID_twin2 = NA_integer_, gen_twin = 2, verbose = FALSE, - zygosity = "MZ" + zygosity = "MZ", + twin_sex = "R" ) } \arguments{ @@ -25,6 +26,8 @@ makeTwins( \item{verbose}{logical. If TRUE, print progress through stages of algorithm} \item{zygosity}{A character string indicating the zygosity of the twins. Default is "MZ" for monozygotic twins.} + +\item{twin_sex}{A character string indicating the sex of the twins. Default is randomly assigned ("R"). If specified, it should be either "M" or "F"} } \value{ Returns a \code{data.frame} with MZ twins information added as a new column. diff --git a/man/ped2add.Rd b/man/ped2add.Rd index dc2fee88..0a08cd09 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -21,6 +21,8 @@ ped2add( save_rate_parlist = 1e+05 * save_rate, save_path = "checkpoint/", compress = TRUE, + mz_twins = FALSE, + mz_method = "addtwins", ... ) } @@ -57,6 +59,10 @@ ped2add( \item{compress}{logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE.} +\item{mz_twins}{logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE.} + +\item{mz_method}{character. The method to handle MZ twins. Options are "merging" (default) or "addtwins". "addtwins" adds the twin2 column to the twin1 column before tcrossprod so that all relatedness flows through a single source, then leaves the twin2 column as zero and relies on the fact that the row/col names are the same to copy the values back to twin2 after tcrossprod. "merging" merges the twin2 column into the twin1 column before tcrossprod and then copies the values back to twin2 after tcrossprod so that both twins appear in the final matrix.} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2com.Rd b/man/ped2com.Rd index f34d6022..dc57cdb7 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -15,7 +15,7 @@ ped2com( standardize_colnames = TRUE, transpose_method = "tcrossprod", adjacency_method = "direct", - isChild_method = "classic", + isChild_method = "partialparent", saveable = FALSE, resume = FALSE, save_rate = 5, @@ -25,6 +25,9 @@ ped2com( save_path = "checkpoint/", adjBeta_method = NULL, compress = TRUE, + mz_twins = TRUE, + mz_method = "addtwins", + beta = FALSE, ... ) } @@ -69,6 +72,12 @@ ped2com( \item{compress}{logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE.} +\item{mz_twins}{logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE.} + +\item{mz_method}{character. The method to handle MZ twins. Options are "merging" (default) or "addtwins". "addtwins" adds the twin2 column to the twin1 column before tcrossprod so that all relatedness flows through a single source, then leaves the twin2 column as zero and relies on the fact that the row/col names are the same to copy the values back to twin2 after tcrossprod. "merging" merges the twin2 column into the twin1 column before tcrossprod and then copies the values back to twin2 after tcrossprod so that both twins appear in the final matrix.} + +\item{beta}{logical. Used for benchmarking} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2fam.Rd b/man/ped2fam.Rd index 3052e568..b3cf1e07 100644 --- a/man/ped2fam.Rd +++ b/man/ped2fam.Rd @@ -10,6 +10,7 @@ ped2fam( momID = "momID", dadID = "dadID", famID = "famID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2fam( \item{famID}{character. Name of the column to be created in ped for the family ID variable} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \value{ diff --git a/man/ped2graph.Rd b/man/ped2graph.Rd index 5e7ac7b1..022af3c7 100755 --- a/man/ped2graph.Rd +++ b/man/ped2graph.Rd @@ -9,6 +9,7 @@ ped2graph( personID = "ID", momID = "momID", dadID = "dadID", + twinID = "twinID", directed = TRUE, adjacent = c("parents", "mothers", "fathers"), ... @@ -23,6 +24,8 @@ ped2graph( \item{dadID}{character. Name of the column in ped for the father ID variable} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{directed}{Logical scalar. Default is TRUE. Indicates whether or not to create a directed graph.} \item{adjacent}{Character. Relationship that defines adjacency in the graph: parents, mothers, or fathers} diff --git a/man/ped2maternal.Rd b/man/ped2maternal.Rd index 03e02311..1f3bcb2e 100755 --- a/man/ped2maternal.Rd +++ b/man/ped2maternal.Rd @@ -10,6 +10,7 @@ ped2maternal( momID = "momID", dadID = "dadID", matID = "matID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2maternal( \item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2paternal.Rd b/man/ped2paternal.Rd index e893ec03..7fac27b9 100755 --- a/man/ped2paternal.Rd +++ b/man/ped2paternal.Rd @@ -10,6 +10,7 @@ ped2paternal( momID = "momID", dadID = "dadID", patID = "patID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2paternal( \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index fa87632e..d56ab72c 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -76,7 +76,18 @@ current version.} \item{fam_shift}{An integer to shift the person ID. Default is 1L. This is useful when simulating multiple pedigrees to avoid ID conflicts.} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} \item{...}{Additional arguments to be passed to other functions.} } diff --git a/tests/testthat/test-buildComponent.R b/tests/testthat/test-buildComponent.R index b00fa039..585d7de3 100644 --- a/tests/testthat/test-buildComponent.R +++ b/tests/testthat/test-buildComponent.R @@ -1,3 +1,249 @@ +test_that("MZ twins coded at relatedness 1 via twinID column", { + # Simple pedigree: two parents and two MZ twin children + ped <- potter + + mz_method_opts <- c("addtwins", "merging") + + for (mz_method in mz_method_opts) { + # mz_method <- "merging" # "addtwins" + # Without mz_twins: siblings get 0.5 + r_no_mz <- ped2add(ped, mz_twins = FALSE, sparse = FALSE, mz_method = mz_method) + expect_equal(r_no_mz["12", "13"], 0.5) + expect_equal(r_no_mz["13", "12"], 0.5) + + # With mz_twins: MZ twins get 1.0 + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE, mz_method = mz_method) + expect_equal(r_mz["12", "13"], 1.0) + expect_equal(r_mz["13", "12"], 1.0) + + # Self-relatedness should still be 1 + expect_equal(r_mz["12", "12"], 1.0) + expect_equal(r_mz["13", "13"], 1.0) + + # Parent-child relatedness unchanged + expect_equal(r_mz["12", "9"], 0.5) + expect_equal(r_mz["13", "9"], 0.5) + expect_equal(r_mz["12", "10"], 0.5) + expect_equal(r_mz["13", "10"], 0.5) + } + ped_kids <- potter + + # Add a child to one of the MZ twins + ped_kids <- addPersonToPed(ped_kids, sex = 0, momID = NA, dadID = NA, personID = 31) + ped_kids <- addPersonToPed(ped_kids, sex = 0, momID = NA, dadID = NA, personID = 32) + ped_kids <- addPersonToPed(ped_kids, sex = 0, momID = 31, dadID = 12, personID = 33) + ped_kids <- addPersonToPed(ped_kids, sex = 0, momID = 32, dadID = 13, personID = 34) + ped_kids <- addPersonToPed(ped_kids, sex = 0, momID = 31, dadID = 13, personID = 35) + + for (mz_method in mz_method_opts) { + r_kids <- ped2add(ped_kids, mz_twins = TRUE, sparse = FALSE, mz_method = mz_method) + # Child of twin1 (ID=31) should be 0.5 to twin1 (parent) + expect_equal(r_kids["33", "12"], 0.5) + # Child of twin1 should ALSO be 0.5 to twin2 (genetically identical to parent) + expect_equal(r_kids["33", "13"], 0.5) + # Child of twin2 (ID=32) should be 0.5 to twin + expect_equal(r_kids["34", "13"], 0.5) + # Child of twin2 should ALSO be 0.5 to twin1 (genetically identical to parent) + expect_equal(r_kids["34", "12"], 0.5) + + # different moms should be 0.25 with different mz twin dads + expect_equal(r_kids["34", "33"], 0.25) + expect_equal(r_kids["34", "35"], 0.25) + + # same mom, different mz twin dads should be 0.5 + expect_equal(r_kids["33", "35"], 0.5) + } + + r_mz1 <- ped2add(ped_kids, mz_twins = TRUE, sparse = FALSE, mz_method = mz_method_opts[1]) + + r_mz2 <- ped2add(ped_kids, mz_twins = TRUE, sparse = FALSE, mz_method = mz_method_opts[2]) + + expect_equal(r_mz1, r_mz2) +}) + + +test_that("MZ twins coded at relatedness 1 via twinID column (complex pedigree)", { + set.seed(1667) + Ngen <- 5 + kpc <- 4 + sexR <- .50 # sometimes fails above .5 + marR <- 1 + + gen_twin <- 3 # max(c(floor(Ngen / 2) - 1, 2)) + + # create base pedigree with twins at gen_twin + + df_midgen_base <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) |> + makeTwins(gen_twin = gen_twin) + + twinIDS <- findMZtwins(df_midgen_base, + returnRows = FALSE, + returnIDs = TRUE, returnAsList = FALSE + ) + + moms_of_twins <- df_midgen_base$momID[df_midgen_base$ID %in% c(twinIDS$twin1_id, twinIDS$twin2_id)] %>% unique() + dads_of_twins <- df_midgen_base$dadID[df_midgen_base$ID %in% c(twinIDS$twin1_id, twinIDS$twin2_id)] %>% unique() + parents_of_twins <- unique(c(moms_of_twins, dads_of_twins)) + + grandmothers_of_twins <- df_midgen_base$momID[df_midgen_base$ID %in% parents_of_twins] %>% unique() + grandfathers_of_twins <- df_midgen_base$dadID[df_midgen_base$ID %in% parents_of_twins] %>% unique() + grandfathers_of_twins <- grandfathers_of_twins[!is.na(grandfathers_of_twins)] + grandmothers_of_twins <- grandmothers_of_twins[!is.na(grandmothers_of_twins)] + grandparents_of_twins <- unique(c(grandmothers_of_twins, grandfathers_of_twins)) + + female_children_of_twins <- df_midgen_base$ID[(df_midgen_base$momID %in% c(twinIDS$twin1_id, twinIDS$twin2_id) | df_midgen_base$dadID %in% c(twinIDS$twin1_id, twinIDS$twin2_id)) & df_midgen_base$sex == "F"] %>% unique() + + male_children_of_twins <- df_midgen_base$ID[(df_midgen_base$momID %in% c(twinIDS$twin1_id, twinIDS$twin2_id) | df_midgen_base$dadID %in% c(twinIDS$twin1_id, twinIDS$twin2_id)) & df_midgen_base$sex == "M"] %>% unique() + + children_of_twins <- c(male_children_of_twins, female_children_of_twins) + + df_midgen_below <- df_midgen_base |> makeInbreeding(ID_mate1 = male_children_of_twins[1], ID_mate2 = female_children_of_twins[length(female_children_of_twins)]) + + df_midgen_above <- df_midgen_base + + df_midgen_above$momID[df_midgen_above$ID %in% parents_of_twins] <- grandmothers_of_twins[1] + df_midgen_above$dadID[df_midgen_above$ID %in% parents_of_twins] <- grandfathers_of_twins[1] + + if (FALSE) { + df_midgen_below %>% + rename(personID = ID) %>% + ggpedigree::ggPedigreeInteractive(config = list( + code_male = "M", focal_fill_personID = twinIDS$twin1_id, + focal_fill_include = TRUE, + sex_color_include = FALSE + )) + } + for (df_midgen in list(df_midgen_base, df_midgen_below, df_midgen_above)) { + r_mz1 <- df_midgen |> + ped2add(mz_method = "merging", mz_twins = TRUE) + r_mz2 <- df_midgen |> + ped2add(mz_method = "addtwins", mz_twins = TRUE) + + r_mz3 <- df_midgen |> + ped2add(mz_twins = FALSE) + + # which rows are the twins + twin_rows <- which(!is.na(df_midgen$twinID)) + child_rows <- which(df_midgen$momID %in% df_midgen$ID[twin_rows] | df_midgen$dadID %in% df_midgen$ID[twin_rows]) + + family_rows <- unique(c(twin_rows, child_rows)) + + expect_equal(sum(as.matrix(r_mz1[family_rows, family_rows]) - as.matrix(r_mz2[family_rows, family_rows])), 0) + + expect_gt(sum(as.matrix(r_mz1[family_rows, family_rows])), sum(as.matrix(r_mz3[family_rows, family_rows]))) + + expect_gt(sum(as.matrix(r_mz2[family_rows, family_rows])), sum(as.matrix(r_mz3[family_rows, family_rows]))) + + + r_mz1_ordered <- r_mz1[order(rownames(r_mz1)), order(colnames(r_mz1))] + r_mz2_ordered <- r_mz2[order(rownames(r_mz2)), order(colnames(r_mz2))] + + expect_equal(sum(r_mz1_ordered - r_mz2_ordered), 0) + + expect_equal(length(r_mz1@i), length(r_mz2@i)) + expect_equal(length(r_mz1@x), length(r_mz2@x)) + expect_equal(length(r_mz1@p), length(r_mz2@p)) + + expect_equal(length(r_mz1@i), length(r_mz3@i)) + expect_equal(length(r_mz2@i), length(r_mz3@i)) + + expect_equal(length(r_mz1@x), length(r_mz3@x)) + expect_equal(length(r_mz2@x), length(r_mz3@x)) + + expect_equal(length(r_mz1@p), length(r_mz3@p)) + expect_equal(length(r_mz2@p), length(r_mz3@p)) + } +}) + +test_that("MZ twins coded at relatedness 1 via twinID column (minimal data.frame)", { + # Simple pedigree: two parents and two MZ twin children + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "M"), + twinID = c(NA, NA, 4, 3), + zygosity = c(NA, NA, "MZ", "MZ") + ) + + # Without mz_twins: siblings get 0.5 + r_no_mz <- ped2add(ped, mz_twins = FALSE, sparse = FALSE) + expect_equal(r_no_mz["3", "4"], 0.5) + expect_equal(r_no_mz["4", "3"], 0.5) + + # With mz_twins: MZ twins get 1.0 + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + expect_equal(r_mz["3", "4"], 1.0) + expect_equal(r_mz["4", "3"], 1.0) + + # Self-relatedness should still be 1 + expect_equal(r_mz["3", "3"], 1.0) + expect_equal(r_mz["4", "4"], 1.0) + + # Parent-child relatedness unchanged + expect_equal(r_mz["3", "1"], 0.5) + expect_equal(r_mz["4", "1"], 0.5) + expect_equal(r_mz["3", "2"], 0.5) + expect_equal(r_mz["4", "2"], 0.5) +}) + +test_that("MZ twins without zygosity column assumes all twinID pairs are MZ", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "M"), + twinID = c(NA, NA, 4, 3) + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + expect_equal(r_mz["3", "4"], 1.0) + expect_equal(r_mz["4", "3"], 1.0) +}) + +test_that("DZ twins with zygosity column are NOT modified", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "F"), + twinID = c(NA, NA, 4, 3), + zygosity = c(NA, NA, "DZ", "DZ") + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + # DZ twins remain at sibling relatedness = 0.5 + expect_equal(r_mz["3", "4"], 0.5) + expect_equal(r_mz["4", "3"], 0.5) +}) + +test_that("MZ twins: downstream child relatedness is correct", { + # 3-generation pedigree: parents -> MZ twins -> twin2 has a child + ped <- data.frame( + ID = c(1, 2, 3, 4, 5, 6), + momID = c(NA, NA, 2, 2, NA, 4), + dadID = c(NA, NA, 1, 1, NA, 5), + sex = c("M", "F", "M", "M", "F", "M"), + twinID = c(NA, NA, 4, 3, NA, NA), + zygosity = c(NA, NA, "MZ", "MZ", NA, NA) + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + + # MZ twins at 1.0 + expect_equal(r_mz["3", "4"], 1.0) + + # Child of twin2 (ID=4) should be 0.5 to twin2 (parent) + expect_equal(r_mz["6", "4"], 0.5) + + # Child of twin2 should ALSO be 0.5 to twin1 (genetically identical to parent) + expect_equal(r_mz["6", "3"], 0.5) + + # Diagonal for both twins should be clean (no inflation) + expect_equal(r_mz["3", "3"], 1.0) + expect_equal(r_mz["4", "4"], 1.0) +}) + test_that(".assignParentValue works", { expect_equal(.assignParentValue("generation"), .5) expect_equal(.assignParentValue("additive"), .5) diff --git a/tests/testthat/test-buildmxPedigrees.R b/tests/testthat/test-buildmxPedigrees.R new file mode 100644 index 00000000..ed1eb00e --- /dev/null +++ b/tests/testthat/test-buildmxPedigrees.R @@ -0,0 +1,312 @@ +# Tests for buildmxPedigrees.R +# All functions in this file require the OpenMx package. +# Tests are skipped automatically when OpenMx is not installed. + +# Helper: a minimal 2-person additive relatedness matrix (parent-child) +make_add2 <- function() matrix(c(1, 0.5, 0.5, 1), nrow = 2) + +# Helper: a 2-person observed data row +make_dat2 <- function(ytemp = c("y1", "y2")) { + matrix(c(1.5, 2.5), nrow = 1, dimnames = list(NULL, ytemp)) +} + +# ─── buildPedigreeModelCovariance ──────────────────────────────────────────── + +test_that("buildPedigreeModelCovariance returns an mxModel with default components", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + mod <- expect_no_error( + buildPedigreeModelCovariance(vars = vars) + ) + expect_true(inherits(mod, "MxModel")) + # Default flags: Vad, Vcn, Vce, Vmt, Ver are TRUE; Vdd and Vam are FALSE + expect_false(is.null(mod$Vad)) + expect_false(is.null(mod$Vcn)) + expect_false(is.null(mod$Vce)) + expect_false(is.null(mod$Vmt)) + expect_false(is.null(mod$Ver)) + expect_null(mod$Vdd) + expect_null(mod$Vam) +}) + +test_that("buildPedigreeModelCovariance includes dominance component when Vdd = TRUE", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + mod <- buildPedigreeModelCovariance(vars = vars, Vdd = TRUE) + expect_false(is.null(mod$Vdd)) +}) + +test_that("buildPedigreeModelCovariance includes A×mt interaction when Vam = TRUE", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + mod <- buildPedigreeModelCovariance(vars = vars, Vam = TRUE) + expect_false(is.null(mod$Vam)) +}) + +test_that("buildPedigreeModelCovariance works with all components enabled", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + mod <- expect_no_error( + buildPedigreeModelCovariance( + vars = vars, + Vad = TRUE, Vdd = TRUE, Vcn = TRUE, + Vce = TRUE, Vmt = TRUE, Vam = TRUE, Ver = TRUE + ) + ) + for (comp in c("Vad", "Vdd", "Vcn", "Vce", "Vmt", "Vam", "Ver")) { + expect_false(is.null(mod[[comp]]), + label = paste("Expected component", comp, "to be present in model") + ) + } +}) + +test_that("buildPedigreeModelCovariance works with minimal components (Vad + Ver only)", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + mod <- expect_no_error( + buildPedigreeModelCovariance( + vars = vars, + Vad = TRUE, Vdd = FALSE, Vcn = FALSE, + Vce = FALSE, Vmt = FALSE, Vam = FALSE, Ver = TRUE + ) + ) + expect_false(is.null(mod$Vad)) + expect_false(is.null(mod$Ver)) + expect_null(mod$Vcn) + expect_null(mod$Vce) +}) + +# ─── buildOneFamilyGroup ───────────────────────────────────────────────────── + +test_that("buildOneFamilyGroup errors when no relatedness matrix is provided", { + skip_if_not_installed("OpenMx") + dat <- make_dat2() + expect_error( + buildOneFamilyGroup( + group_name = "fam1", + Addmat = NULL, Nucmat = NULL, Extmat = NULL, + Mtdmat = NULL, Amimat = NULL, Dmgmat = NULL, + full_df_row = dat, + ytemp = c("y1", "y2") + ), + regexp = "At least one relatedness matrix must be provided" + ) +}) + +test_that("buildOneFamilyGroup returns an mxModel with an additive matrix", { + skip_if_not_installed("OpenMx") + Addmat <- make_add2() + dat <- make_dat2() + mod <- expect_no_error( + buildOneFamilyGroup( + group_name = "fam1", + Addmat = Addmat, + full_df_row = dat, + ytemp = c("y1", "y2") + ) + ) + expect_true(inherits(mod, "MxModel")) + expect_equal(mod$name, "fam1") + expect_false(is.null(mod$A)) +}) + +test_that("buildOneFamilyGroup returns an mxModel with nuclear family matrix", { + skip_if_not_installed("OpenMx") + Nucmat <- make_add2() + dat <- make_dat2() + mod <- expect_no_error( + buildOneFamilyGroup( + group_name = "fam2", + Nucmat = Nucmat, + full_df_row = dat, + ytemp = c("y1", "y2") + ) + ) + expect_true(inherits(mod, "MxModel")) + expect_false(is.null(mod$Cn)) +}) + +test_that("buildOneFamilyGroup determines family size from any provided matrix", { + skip_if_not_installed("OpenMx") + # Use Extmat to size the model + Extmat <- matrix(c(1, 1, 1, 1), nrow = 2) + dat <- make_dat2() + mod <- expect_no_error( + buildOneFamilyGroup( + group_name = "famExt", + Extmat = Extmat, + full_df_row = dat, + ytemp = c("y1", "y2") + ) + ) + # # Extmat signals "include Vce"; the algebra always uses U (unit matrix) + expect_false(is.null(mod$U)) +}) + +# ─── buildFamilyGroups ─────────────────────────────────────────────────────── + +test_that("buildFamilyGroups returns one group model per row of data", { + skip_if_not_installed("OpenMx") + Addmat <- make_add2() + # Two families, each with 2 observed variables + dat <- matrix(c(1.0, 2.0, 3.0, 4.0), + nrow = 2, + dimnames = list(NULL, c("y1", "y2")) + ) + groups <- expect_no_error( + buildFamilyGroups(dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat) + ) + expect_true(is.list(groups)) + expect_equal(length(groups), nrow(dat)) +}) + +test_that("buildFamilyGroups names group models with supplied prefix", { + skip_if_not_installed("OpenMx") + Addmat <- make_add2() + dat <- matrix(c(1.0, 2.0), nrow = 1, dimnames = list(NULL, c("y1", "y2"))) + groups <- buildFamilyGroups( + dat = dat, ytemp = c("y1", "y2"), + Addmat = Addmat, prefix = "family" + ) + expect_equal(groups[[1]]$name, "family1") +}) + +test_that("buildFamilyGroups default prefix is 'fam'", { + skip_if_not_installed("OpenMx") + Addmat <- make_add2() + dat <- matrix(c(1.0, 2.0), nrow = 1, dimnames = list(NULL, c("y1", "y2"))) + groups <- buildFamilyGroups( + dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + ) + expect_equal(groups[[1]]$name, "fam1") +}) + +# ─── buildPedigreeMx ───────────────────────────────────────────────────────── + +test_that("buildPedigreeMx returns a multigroup mxModel", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.5, dd2 = 0.3, cn2 = 0.2, ce2 = 0.4, + mt2 = 0.1, am2 = 0.25, ee2 = 0.6 + ) + Addmat <- make_add2() + dat <- matrix(c(1.0, 2.0, 3.0, 4.0), + nrow = 2, + dimnames = list(NULL, c("y1", "y2")) + ) + group_models <- buildFamilyGroups( + dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + ) + mod <- expect_no_error( + buildPedigreeMx( + model_name = "TestPedigreeMx", + vars = vars, + group_models = group_models + ) + ) + expect_true(inherits(mod, "MxModel")) + expect_equal(mod$name, "TestPedigreeMx") +}) + +# ─── fitPedigreeModel ──────────────────────────────────────────────────────── + +test_that("fitPedigreeModel errors without OpenMx", { + # This test is meaningful only when OpenMx is absent; skip otherwise. + skip_if(requireNamespace("OpenMx", quietly = TRUE), + message = "OpenMx is installed; skipping no-OpenMx error test" + ) + expect_error( + fitPedigreeModel( + data = matrix(c(1, 2), nrow = 1, dimnames = list(NULL, c("y1", "y2"))) + ), + regexp = "OpenMx" + ) +}) + +test_that("fitPedigreeModel runs end-to-end with a trivial dataset", { + skip_if_not_installed("OpenMx") + set.seed(42) + # Two families, each with 2 (simulated) observed scores + dat <- matrix( + c(0.1, -0.1, 0.2, -0.2), + nrow = 2, + dimnames = list(NULL, c("y1", "y2")) + ) + Addmat <- make_add2() + group_models <- buildFamilyGroups( + dat = dat, ytemp = c("y1", "y2"), Addmat = Addmat + ) + vars <- list( + ad2 = 0.4, dd2 = 0.1, cn2 = 0.1, ce2 = 0.1, + mt2 = 0.05, am2 = 0.05, ee2 = 0.3 + ) + result <- expect_no_error( + fitPedigreeModel( + model_name = "FitTest", + vars = vars, + data = dat, + group_models = group_models, + tryhard = FALSE + ) + ) + expect_true(inherits(result, "MxModel")) +}) + +test_that("fitPedigreeModel generates group_models from data and relatedness matrices", { + skip_if_not_installed("OpenMx") + set.seed(42) + # Two families, each with 2 (simulated) observed scores + dat <- matrix( + c(0.1, -0.1, 0.2, -0.2), + nrow = 2, + dimnames = list(NULL, c("y1", "y2")) + ) + Addmat <- make_add2() + vars <- list( + ad2 = 0.4, dd2 = 0.1, cn2 = 0.1, ce2 = 0.1, + mt2 = 0.05, am2 = 0.05, ee2 = 0.3 + ) + result <- expect_no_error( + fitPedigreeModel( + model_name = "FitTestAutoGroup", + vars = vars, + data = dat, + group_models = NULL, # Will be auto-generated + Addmat = Addmat, + tryhard = FALSE + ) + ) + expect_true(inherits(result, "MxModel")) +}) + +test_that("fitPedigreeModel errors when group_models and data are both NULL", { + skip_if_not_installed("OpenMx") + vars <- list( + ad2 = 0.4, dd2 = 0.1, cn2 = 0.1, ce2 = 0.1, + mt2 = 0.05, am2 = 0.05, ee2 = 0.3 + ) + expect_error( + fitPedigreeModel( + model_name = "FitTest", + vars = vars, + data = NULL, + group_models = NULL + ), + regexp = "Either 'group_models' or 'data' must be provided" + ) +}) diff --git a/tests/testthat/test-helpTwins.R b/tests/testthat/test-helpTwins.R new file mode 100644 index 00000000..e19643bb --- /dev/null +++ b/tests/testthat/test-helpTwins.R @@ -0,0 +1,100 @@ +test_that("fuse twins behaves", { + # Simple pedigree: two parents and two MZ twin children + ped1 <- potter + ped1$ID <- ped1$personID + ped2 <- ped1 + ped2$famID <- 2 + + ped2$ID <- ped2$personID + 100 + ped2$momID <- ped2$momID + 100 + ped2$dadID <- ped2$dadID + 100 + ped2$twinID <- ped2$twinID + 100 + + ped <- rbind(ped1, ped2) + remove(ped2) + remove(ped1) + # returnRows = TRUE, + # returnIDs = FALSE, + # returnAsList = TRUE + + returnedRowsList <- findMZtwins(ped, returnRows = TRUE, returnIDs = FALSE, returnAsList = TRUE) + returnIDsList <- findMZtwins(ped, returnRows = FALSE, returnIDs = TRUE, returnAsList = TRUE) + returnedBothList <- findMZtwins(ped, returnRows = T, returnIDs = T, returnAsList = TRUE) + + # no error should be thrown when running fuseTwins with any of the above outputs as arguments + expect_no_error( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = NULL, + mz_row_pairs = NULL + ) + ) + + expect_no_error( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = NULL, + mz_row_pairs = returnedRowsList + ) + ) + expect_no_error( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = returnIDsList, + mz_row_pairs = NULL + ) + ) + expect_no_error( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = returnedBothList$pair_ids, + mz_row_pairs = returnedBothList$pair_rows + ) + ) + + + df_null <- tryCatch( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = NULL, + mz_row_pairs = NULL + ), + error = function(e) e + ) + + + df_returnedRows <- tryCatch( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = NULL, + mz_row_pairs = returnedRowsList + ), + error = function(e) e + ) + + + df_returnedIDs <- tryCatch( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = returnIDsList, + mz_row_pairs = NULL + ), + error = function(e) e + ) + + + df_returnedBoth <- tryCatch( + fuseTwins(ped, + test_df_twins = TRUE, + mz_id_pairs = returnedBothList$pair_ids, + mz_row_pairs = returnedBothList$pair_rows + ), + error = function(e) e + ) + + + expect_equal(df_returnedRows, df_returnedIDs) + expect_equal(df_returnedRows, df_returnedBoth) + expect_equal(df_returnedRows, df_null) + expect_equal(nrow(df_returnedRows), 2) # One pair of twins should returned +}) diff --git a/tests/testthat/test-segmentPedigree.R b/tests/testthat/test-segmentPedigree.R index c7224c6d..cf1cce02 100644 --- a/tests/testthat/test-segmentPedigree.R +++ b/tests/testthat/test-segmentPedigree.R @@ -1,9 +1,11 @@ test_that("ped2fam is smart about string ids", { data(hazard) + # ped2fam should work with numeric IDs and produce numeric IDs ds_num <- ped2fam(hazard, famID = "newFamID") expect_true(is.numeric(ds_num$ID)) expect_true(is.numeric(ds_num$newFamID)) hazard$ID_og <- hazard$ID + # ped2fam should work with string IDs and produce string IDs hazard$ID <- paste0("ID", hazard$ID) hazard$dadID <- paste0("ID", hazard$dadID) hazard$dadID[hazard$dadID == "IDNA"] <- NA @@ -47,7 +49,6 @@ test_that("ped2graph produces a graph for hazard data with mothers", { }) - test_that("ped2graph produces a graph for hazard data with fathers", { expect_silent(data(hazard)) g <- ped2graph(hazard, adjacent = "fathers") diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 75100e4c..a13ff5d1 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -7,13 +7,23 @@ test_that("simulated pedigree generates expected data structure", { beta_options <- c(F, T) strict_tolerance <- 1e-8 sex_tolerance <- .035 + base_length <- 57 + base_length_tol <- 0.2 * base_length + beta_match_base <- FALSE # beta_options <- T for (beta in beta_options) { set.seed(seed) message("Beta option Starting: ", beta) results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) # Check that dimnames are correct - expect_equal(length(results$ID), 57, tolerance = strict_tolerance) + # Base version: exact count. Optimized version: within 20% range + if (isFALSE(beta) || (isTRUE(beta) && beta_match_base)) { + expect_equal(length(results$ID), base_length, tolerance = strict_tolerance) + } else { + expect_true(length(results$ID) >= base_length - base_length_tol && length(results$ID) <= base_length + base_length_tol, + info = paste0("Beta=TRUE: Expected 45-70 individuals, got ", length(results$ID)) + ) + } expect_equal(length(results), 7, tolerance = strict_tolerance) # check number of generations @@ -42,13 +52,23 @@ test_that("simulated pedigree generates expected data structure when sexR is imb beta_options <- c(F, T) strict_tolerance <- 1e-8 sex_tolerance <- .03 + base_length <- 154 + base_length_tol <- 0.2 * base_length + beta_match_base <- FALSE # beta_options <- T for (beta in beta_options) { set.seed(seed) message("Beta option Starting: ", beta) results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) # Check that dimnames are correct - expect_equal(length(results$ID), 154, tolerance = strict_tolerance) + # Base version: exact count. Optimized version: within 20% range + if (isFALSE(beta) || (isTRUE(beta) && beta_match_base)) { + expect_equal(length(results$ID), base_length, tolerance = strict_tolerance) + } else { + expect_true(length(results$ID) >= base_length - base_length_tol && length(results$ID) <= base_length + base_length_tol, + info = paste0("Beta=TRUE: Expected 123-185 individuals, got ", length(results$ID)) + ) + } expect_equal(length(results), 7, tolerance = strict_tolerance) # check number of generations @@ -79,6 +99,12 @@ test_that("simulated pedigree generates expected data structure when sexR is imb beta_options <- c(F, T) strict_tolerance <- 1e-8 sex_tolerance <- .03 + # Optimized version needs wider tolerance for sex ratios on large pedigrees + sex_tolerance_opt <- .07 + + base_length <- 424 + base_length_tol <- 0.2 * base_length + beta_match_base <- FALSE # beta_options <- T for (beta in beta_options) { @@ -86,12 +112,27 @@ test_that("simulated pedigree generates expected data structure when sexR is imb message("Beta option Starting: ", beta) results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) # Check that dimnames are correct - expect_equal(length(results$ID), 424, tolerance = strict_tolerance) + # Base version: exact count. Optimized version: within 20% range + if (isFALSE(beta) || (isTRUE(beta) && beta_match_base)) { + expect_equal(length(results$ID), base_length, tolerance = strict_tolerance) + } else { + expect_true(length(results$ID) >= base_length - base_length_tol && length(results$ID) <= base_length + base_length_tol, + info = paste0("Beta=TRUE: Expected 340-510 individuals, got ", length(results$ID)) + ) + } expect_equal(length(results), 7, tolerance = strict_tolerance) # check number of generations expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) + # expect there to be parents in each for all generations except the first one + filter_parents <- results %>% + group_by(gen) %>% + summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) + + expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) + expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) + # check number of sex ratio sex_mean_male <- mean(results$sex == "M") @@ -99,8 +140,11 @@ test_that("simulated pedigree generates expected data structure when sexR is imb expect_lt(sex_mean_male, sex_mean_female) - expect_equal(sex_mean_male, sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) - expect_equal(sex_mean_female, 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + # Use wider tolerance for optimized version + tol <- if (isFALSE(beta)) sex_tolerance else sex_tolerance_opt + + expect_equal(sex_mean_male, sexR, tolerance = tol, info = paste0("Beta option: ", beta)) + expect_equal(sex_mean_female, 1 - sexR, tolerance = tol, info = paste0("Beta option: ", beta)) message("Beta option Ending: ", beta) } }) @@ -117,7 +161,11 @@ test_that("simulated pedigree generates expected data structure but supply var n beta_options <- c(F, T) strict_tolerance <- 1e-8 sex_tolerance <- .03 + sex_tolerance_opt <- .07 # beta_options <- T + base_length <- 57 + base_length_tol <- 0.2 * base_length + beta_match_base <- FALSE for (beta in beta_options) { set.seed(seed) @@ -129,7 +177,14 @@ test_that("simulated pedigree generates expected data structure but supply var n beta = beta ) # Check that dimnames are correct - expect_equal(length(results$Id), 57, tolerance = strict_tolerance) + # Base version: exact count. Optimized version: within 20% range + if (isFALSE(beta) || (isTRUE(beta) && beta_match_base)) { + expect_equal(length(results$Id), base_length, tolerance = strict_tolerance) + } else { + expect_true(length(results$Id) >= base_length - base_length_tol && length(results$Id) <= base_length + base_length_tol, + info = paste0("Beta=TRUE: Expected 45-70 individuals, got ", length(results$Id)) + ) + } expect_equal(length(results), 7, tolerance = strict_tolerance) # check number of generations @@ -143,9 +198,19 @@ test_that("simulated pedigree generates expected data structure but supply var n expect_lt(sex_mean_male, sex_mean_female) + # expect there to be parents in each for all generations except the first one + filter_parents <- results %>% + group_by(gen) %>% + summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) - expect_equal(sex_mean_male, sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) - expect_equal(sex_mean_female, 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) + expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) + + + # Use wider tolerance for optimized version + tol <- if (isFALSE(beta)) sex_tolerance else sex_tolerance_opt + expect_equal(sex_mean_male, sexR, tolerance = tol, info = paste0("Beta option: ", beta)) + expect_equal(sex_mean_female, 1 - sexR, tolerance = tol, info = paste0("Beta option: ", beta)) message("Beta option Ending: ", beta) } }) @@ -157,6 +222,7 @@ test_that("simulatePedigree verbose prints updates", { sexR <- .50 marR <- .7 beta_options <- c(F, T) + # beta_options <- T for (beta in beta_options) { set.seed(seed) diff --git a/tests/testthat/test-tweakPedigree.R b/tests/testthat/test-tweakPedigree.R index b0e63987..e84cb273 100644 --- a/tests/testthat/test-tweakPedigree.R +++ b/tests/testthat/test-tweakPedigree.R @@ -163,17 +163,25 @@ test_that("makeInbreeding - Inbred mates specified by generation and sibling", { marR <- .7 gen_inbred <- 2 type_inbred <- "sibling" + prefer_unmated <- c(TRUE, FALSE) + ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) # - result <- makeInbreeding(ped, gen_inbred = gen_inbred, type_inbred = type_inbred) - expect_equal(names(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) - - # do we have the same people? - expect_equal(result$ID, ped$ID) - - # did we get more spID values than we started with? - expect_gt(sum(!is.na(result$spID)), sum(!is.na(ped$spID))) + for (prefer in prefer_unmated) { + result <- makeInbreeding(ped, + gen_inbred = gen_inbred, type_inbred = type_inbred, + prefer_unmated = prefer, + verbose = TRUE + ) + expect_equal(names(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) + + # do we have the same people? + expect_equal(result$ID, ped$ID) + + # did we get more spID values than we started with? + expect_gt(sum(!is.na(result$spID)), sum(!is.na(ped$spID))) + } }) test_that("makeInbreeding - Inbred mates specified by generation and cousin", { @@ -221,6 +229,9 @@ test_that("dropLink - Drop specified by ID", { # are the dataframes the same in both the undropped and dropepd relationships for all but the dropped ID? + expect_equal(colnames(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) + names(ped) <- c("famID", "ID", "gen", "dadID", "momID", "spID", "sex") + expect_equal(result[result$ID != ID_drop, ], ped[ped$ID != ID_drop, ]) # are the families of the dropped ID in the original? @@ -242,6 +253,9 @@ test_that("dropLink - Drop specified by ID", { ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) result <- dropLink(ped, ID_drop = ID_drop) + expect_equal(colnames(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) + names(ped) <- c("famID", "ID", "gen", "dadID", "momID", "spID", "sex") + # are the dataframes the same in both the undropped and dropped relationships for all but the dropped ID? expect_equal(result[result$ID != ID_drop, ], ped[ped$ID != ID_drop, ]) @@ -262,7 +276,8 @@ test_that("dropLink - Drop specified by generation", { ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) result <- dropLink(ped, gen_drop = gen_drop) - + expect_equal(colnames(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) + names(ped) <- c("famID", "ID", "gen", "dadID", "momID", "spID", "sex") # are the dataframes the same in both the undropped and dropped relationships for all but the dropped gen? expect_equal(result[result$gen != gen_drop, ], ped[ped$gen != gen_drop, ]) @@ -399,3 +414,328 @@ test_that("addPersonToPed works as expected with zygosity", { expect_equal(updated3$personID[4], max(ped$personID, na.rm = TRUE) + 2) }) + +# Tests for single-ID specification (auto-find the other) + +test_that("makeTwins - specify only ID_twin1, auto-find twin2", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + # Person 3 (M) and 4 (F) are siblings. With DZ zygosity, either could be auto-selected. + result <- makeTwins(ped, ID_twin1 = 3, zygosity = "DZ") + expect_equal(sum(!is.na(result$twinID)), 2) + # Twin1 should be person 3 + expect_equal(result$twinID[result$ID == 3], 4) + expect_equal(result$twinID[result$ID == 4], 3) +}) + +test_that("makeTwins - specify only ID_twin2, auto-find twin1", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + result <- makeTwins(ped, ID_twin2 = 4, zygosity = "DZ") + expect_equal(sum(!is.na(result$twinID)), 2) + expect_equal(result$twinID[result$ID == 4], 3) + expect_equal(result$twinID[result$ID == 3], 4) +}) + +test_that("makeInbreeding - specify only ID_mate1, auto-find mate2", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + # Person 3 (M) should auto-find person 4 (F) as opposite-sex sibling + result <- makeInbreeding(ped, ID_mate1 = 3) + expect_equal(result$spID[result$ID == 3], 4) + expect_equal(result$spID[result$ID == 4], 3) +}) + +# ─── makeTwins edge cases ──────────────────────────────────────────────────── + +test_that("makeTwins - invalid gen_twin below 2 issues warning and returns unchanged ped", { + set.seed(1) + ped <- simulatePedigree(kpc = 4, Ngen = 4, sexR = .5, marR = .7) + # gen_twin = 1 is invalid (< 2) + expect_warning( + result <- makeTwins(ped, gen_twin = 1), + regexp = "generation of the twins" + ) + # The returned pedigree should not have twinID or zygosity columns + expect_false("twinID" %in% colnames(result)) + expect_false("zygosity" %in% colnames(result)) + # Row count unchanged + expect_equal(nrow(result), nrow(ped)) +}) + +test_that("makeTwins - invalid gen_twin above max generation issues warning", { + set.seed(1) + ped <- simulatePedigree(kpc = 4, Ngen = 4, sexR = .5, marR = .7) + max_gen <- max(ped$gen) + expect_warning( + result <- makeTwins(ped, gen_twin = max_gen + 1), + regexp = "generation of the twins" + ) + expect_false("twinID" %in% colnames(result)) +}) + +test_that("makeTwins - verbose prints twin IDs when both specified", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + # verbose = TRUE should not error + expect_no_error(makeTwins(ped, ID_twin1 = 3, ID_twin2 = 4, verbose = TRUE)) +}) + +test_that("makeTwins - twinID column is updated in-place when it already exists", { + set.seed(2) + ped <- simulatePedigree(kpc = 4, Ngen = 3, sexR = .5, marR = .7) + # Create first pair of twins (adds twinID column) + ped_t1 <- makeTwins(ped, gen_twin = 2) + expect_true("twinID" %in% colnames(ped_t1)) + # Second call should reuse the existing twinID column (not create a new MZtwin column) + ped_t2 <- expect_no_error(makeTwins(ped_t1, gen_twin = 2)) + expect_true("twinID" %in% colnames(ped_t2)) + expect_false("MZtwin" %in% colnames(ped_t2)) + # At minimum the original twin pair is still recorded + expect_gte(sum(!is.na(ped_t2$twinID)), 2) +}) + +# ─── makeInbreeding – auto-find mate1 when only ID_mate2 provided ──────────── + +test_that("makeInbreeding - specify only ID_mate2, auto-find mate1", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + # Person 4 (F) specified; person 3 (M) should be auto-selected as opposite-sex sibling + result <- makeInbreeding(ped, ID_mate2 = 4) + expect_equal(result$spID[result$ID == 3], 4) + expect_equal(result$spID[result$ID == 4], 3) +}) + +test_that("makeInbreeding - prefer_unmated=TRUE with single ID_mate1 runs without error", { + set.seed(42) + ped <- simulatePedigree(kpc = 4, Ngen = 4, sexR = .5, marR = .7) + # Pick a generation-2 individual with an opposite-sex sibling + gen2_ids <- ped$ID[ped$gen == 2 & !is.na(ped$dadID)] + for (cand in gen2_ids) { + cand_sex <- ped$sex[ped$ID == cand] + cand_dad <- ped$dadID[ped$ID == cand] + cand_mom <- ped$momID[ped$ID == cand] + opp_pool <- ped$ID[ + ped$ID != cand & ped$gen == 2 & + !is.na(ped$dadID) & ped$dadID == cand_dad & + !is.na(ped$momID) & ped$momID == cand_mom & + ped$sex != cand_sex + ] + if (length(opp_pool) > 0) { + result <- expect_no_error( + makeInbreeding(ped, ID_mate1 = cand, prefer_unmated = TRUE) + ) + # The candidate's spID should have been set to one of the eligible siblings + selected_mate <- result$spID[result$ID == cand] + expect_true(!is.na(selected_mate)) + expect_true(selected_mate %in% opp_pool) + # The relationship should be symmetric + expect_equal(result$spID[result$ID == selected_mate], cand) + break + } + } +}) + +test_that("makeInbreeding - prefer_unmated=TRUE with only ID_mate2", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + result <- expect_no_error( + makeInbreeding(ped, ID_mate2 = 4, prefer_unmated = TRUE) + ) + expect_equal(result$spID[result$ID == 4], 3) + expect_equal(result$spID[result$ID == 3], 4) +}) + +# ─── dropLink – sex_drop filter ────────────────────────────────────────────── + +test_that("dropLink - drop only males in a generation via sex_drop", { + set.seed(15) + ped <- simulatePedigree(kpc = 4, Ngen = 4, sexR = .5, marR = .7) + names(ped)[names(ped) == "fam"] <- "famID" + + result <- dropLink(ped, gen_drop = 2, sex_drop = "M") + + # Some male in gen 2 should now have NA parents + males_gen2 <- result[result$gen == 2 & result$sex == "M", ] + expect_true(any(is.na(males_gen2$dadID) | is.na(males_gen2$momID))) + + # Females in gen 2 should be completely unchanged + females_gen2_orig <- ped[ped$gen == 2 & ped$sex == "F", ] + females_gen2_res <- result[result$gen == 2 & result$sex == "F", ] + expect_equal(females_gen2_res$dadID, females_gen2_orig$dadID) + expect_equal(females_gen2_res$momID, females_gen2_orig$momID) +}) + +test_that("dropLink - warning when target pool is empty", { + # Generation 1 founders have no dadID/momID, so the pool is always empty + set.seed(15) + ped <- simulatePedigree(kpc = 4, Ngen = 4, sexR = .5, marR = .7) + expect_warning( + result <- dropLink(ped, gen_drop = 1), + regexp = "No individual is dropped" + ) + # Pedigree should be returned unchanged + expect_equal(nrow(result), nrow(ped)) +}) + +# ─── addPersonToPed – additional paths ─────────────────────────────────────── + +test_that("addPersonToPed - error when overwrite=TRUE and personID does not exist", { + ped <- data.frame( + personID = c(1L, 2L), + name = c("Alice", "Bob"), + sex = c("F", "M"), + momID = c(NA, NA), + dadID = c(NA, NA), + twinID = c(NA_integer_, NA_integer_), + stringsAsFactors = FALSE + ) + expect_error( + addPersonToPed(ped, personID = 99, overwrite = TRUE), + regexp = "does not exist in the pedigree" + ) +}) + +test_that("addPersonToPed - notes column is handled when present in ped", { + ped <- data.frame( + personID = c(1L, 2L), + name = c("Alice", "Bob"), + sex = c("F", "M"), + momID = c(NA, NA), + dadID = c(NA, NA), + twinID = c(NA_integer_, NA_integer_), + notes = c(NA_character_, NA_character_), + stringsAsFactors = FALSE + ) + updated <- addPersonToPed(ped, + name = "Charlie", sex = "M", + momID = 1, dadID = 2, + notes = "test note", personID = 10 + ) + expect_equal(nrow(updated), 3) + expect_equal(updated$notes[3], "test note") + + # When notes not supplied it should be NA + updated2 <- addPersonToPed(ped, name = "Dana", sex = "F") + expect_true(is.na(updated2$notes[3])) +}) + +test_that("addPersonToPed - non-data.frame input raises error", { + expect_error( + addPersonToPed(list(personID = 1), personID = 2) + # stopifnot(is.data.frame(ped)) fires for non-data.frame input + ) +}) + +# ─── makePool ──────────────────────────────────────────────────────────────── + +test_that("makePool returns opposite-sex siblings with shared parents", { + ped <- data.frame( + famID = c(1, 1, 1, 1), + ID = c(1, 2, 3, 4), + gen = c(1, 1, 2, 2), + dadID = c(NA, NA, 1, 1), + momID = c(NA, NA, 2, 2), + spID = c(NA, NA, NA, NA), + sex = c("M", "F", "M", "F") + ) + # Person 3 is male; person 4 should be the pool + pool <- BGmisc:::makePool( + ped = ped, + mate_id = 3, + mate_sex = "M", + mate_dad = 1, + mate_mom = 2, + prefer_unmated = FALSE + ) + expect_equal(pool, 4) +}) + +test_that("makePool with prefer_unmated=FALSE returns all qualifying siblings", { + ped <- data.frame( + famID = c(1, 1, 1, 1, 1), + ID = c(1, 2, 3, 4, 5), + gen = c(1, 1, 2, 2, 2), + dadID = c(NA, NA, 1, 1, 1), + momID = c(NA, NA, 2, 2, 2), + spID = c(NA, NA, NA, 5, NA), # person 4 is mated, person 6 is not + sex = c("M", "F", "M", "F", "F") + ) + # Both female siblings (4 and 5) should appear with prefer_unmated=FALSE + pool <- BGmisc:::makePool( + ped = ped, + mate_id = 3, + mate_sex = "M", + mate_dad = 1, + mate_mom = 2, + prefer_unmated = FALSE + ) + expect_true(4 %in% pool) + expect_true(5 %in% pool) +}) + +test_that("makePool with gen_inbred filters by generation", { + ped <- data.frame( + famID = c(1, 1, 1, 1, 1, 1), + ID = c(1, 2, 3, 4, 5, 6), + gen = c(1, 1, 2, 2, 3, 3), + dadID = c(NA, NA, 1, 1, NA, NA), + momID = c(NA, NA, 2, 2, NA, NA), + spID = c(NA, NA, NA, NA, NA, NA), + sex = c("M", "F", "M", "F", "M", "F") + ) + # With gen_inbred=2 the pool should be restricted to gen 2 + pool <- BGmisc:::makePool( + ped = ped, + mate_id = 3, + mate_sex = "M", + mate_dad = 1, + mate_mom = 2, + prefer_unmated = FALSE, + gen_inbred = 2 + ) + # Only person 4 is in gen 2, opposite sex, same parents + expect_equal(pool, 4) +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore index edf6fea2..43fc0f18 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,2 +1,3 @@ *.R +*.html diff --git a/vignettes/v0_network.Rmd b/vignettes/v0_network.Rmd index 2d78d92b..958e5db0 100644 --- a/vignettes/v0_network.Rmd +++ b/vignettes/v0_network.Rmd @@ -2,7 +2,7 @@ title: "Network tools for finding extended pedigrees and path tracing" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Network} + %\VignetteIndexEntry{Network tools for finding extended pedigrees and path tracing} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/v0_network.html b/vignettes/v0_network.html deleted file mode 100644 index 6c622f70..00000000 --- a/vignettes/v0_network.html +++ /dev/null @@ -1,552 +0,0 @@ - - - - - - - - - - - - - - -Network tools for finding extended pedigrees and path tracing - - - - - - - - - - - - - - - - - - - - - - - - - - -

Network tools for finding extended -pedigrees and path tracing

- - - -
-

Introduction

-

This vignette showcases two key features that capitalize on the -network structure inherent in pedigrees:

-
    -
  1. Finding extended families with any connecting -relationships between members. This feature strictly uses a person’s ID, -mother’s ID, and father’s ID to find out which people in a dataset are -remotely related by any path, effectively finding all separable extended -families in a dataset.

  2. -
  3. Using path tracing rules to quantify the amount of -relatedness between all pairs of individuals in a dataset. The amount of -relatedness can be characterized by additive nuclear DNA, shared -mitochondrial DNA, sharing both parents, or being part of the same -extended pedigree.

  4. -
-
-

Loading Required Libraries and Data

-
library(BGmisc)
-data(potter)
-
-
-
-

Finding Extended Families

-

Many pedigree datasets only contain information on the person, their -mother, and their father, often without nuclear or extended family IDs. -Recognizing which sets of people are unrelated simplifies many -pedigree-related tasks. This function facilitates those tasks by finding -all the extended families. People within the same extended family have -at least some form of relation, however distant, while those in -different extended families have no relations.

-
-Potter Family Pedigree -

-Potter Family Pedigree -

-
-

We will use the potter pedigree data as an example. For -convenience, we’ve renamed the family ID variable to oldfam -to avoid confusion with the new family ID variable we will create.

-
df_potter <- potter
-names(df_potter)[names(df_potter) == "famID"] <- "oldfam"
-
-ds <- ped2fam(df_potter, famID = "famID", personID = "personID")
-
-table(ds$famID, ds$oldfam)
-#>    
-#>      1
-#>   1 36
-

Because the potter data already had a family ID -variable, we compare our newly created variable to the pre-existing one. -They match!

-
-
-

Computing Relatedness

-

Once you know which sets of people are related at all to one another, -you’ll likely want to know how much. For additive genetic relatedness, -you can use the ped2add() function.

-
add <- ped2add(potter, sparse = FALSE)
-

This computes the additive genetic relatedness for everyone in the -data. It returns a square, symmetric matrix that has as many rows and -columns as there are IDs.

-
add[1:7, 1:7]
-#>     1    2    3    4   5     6     7
-#> 1 1.0 0.50 0.00 0.00 0.0 0.500 0.000
-#> 2 0.5 1.00 0.00 0.00 0.0 0.250 0.000
-#> 3 0.0 0.00 1.00 0.50 0.0 0.500 0.250
-#> 4 0.0 0.00 0.50 1.00 0.0 0.250 0.500
-#> 5 0.0 0.00 0.00 0.00 1.0 0.000 0.500
-#> 6 0.5 0.25 0.50 0.25 0.0 1.000 0.125
-#> 7 0.0 0.00 0.25 0.50 0.5 0.125 1.000
-

The entry in the ith row and the jth column gives the relatedness -between person i and person j. For example, person 1 (Vernon Dursley) -shares 0.5 of their nuclear DNA with person 6 (Dudley Dursley), shares -0.5 of their nuclear DNA with person 2 (Marjorie Dursley).

-
table(add)
-#> add
-#>      0 0.0625  0.125   0.25    0.5      1 
-#>    788      6     94    208    164     36
-

It’s probably fine to do this on the whole dataset when your data -have fewer than 10,000 people. When the data get large, however, it’s -much more efficient to compute this relatedness separately for each -extended family.

-
add_list <- lapply(
-  unique(potter$famID),
-  function(d) {
-    tmp <- potter[potter$famID %in% d, ]
-    ped2add(tmp, sparse = FALSE)
-  }
-)
-
-

Other relatedness measures

-

The function works similarly for mitochondrial -(ped2mit), common nuclear environment through sharing both -parents (ped2cn), and common extended family environment -(ped2ce).

-
-

Computing mitochondrial relatedness

-

Here we calculate the mitochondrial relatedness between all pairs of -individuals in the potter dataset.

-
mit <- ped2mit(potter, sparse = FALSE)
-mit[1:7, 1:7]
-#>   1 2 3 4 5 6 7
-#> 1 1 1 0 0 0 0 0
-#> 2 1 1 0 0 0 0 0
-#> 3 0 0 1 1 0 1 1
-#> 4 0 0 1 1 0 1 1
-#> 5 0 0 0 0 1 0 0
-#> 6 0 0 1 1 0 1 1
-#> 7 0 0 1 1 0 1 1
-table(mit)
-#> mit
-#>    0    1 
-#> 1082  214
-

As you can see, some of the family members share mitochondrial DNA, -such as person 2 and person 3 0, whereas person 1 and person 3 do -not.

-
-
-

Computing relatedness through common nuclear environment

-

Here we calculate the relatedness between all pairs of individuals in -the potter dataset through sharing both parents.

-
commonNuclear <- ped2cn(potter, sparse = FALSE)
-commonNuclear[1:7, 1:7]
-#>   1 2 3 4 5 6 7
-#> 1 1 1 0 0 0 0 0
-#> 2 1 1 0 0 0 0 0
-#> 3 0 0 1 1 0 0 0
-#> 4 0 0 1 1 0 0 0
-#> 5 0 0 0 0 1 0 0
-#> 6 0 0 0 0 0 1 0
-#> 7 0 0 0 0 0 0 1
-
-table(commonNuclear)
-#> commonNuclear
-#>    0    1 
-#> 1196  100
-
-
-

Computing relatedness through common extended family -environment

-

Here we calculate the relatedness between all pairs of individuals in -the potter dataset through sharing an extended family.

-
extendedFamilyEnvironment <- ped2ce(potter, sparse = FALSE)
-extendedFamilyEnvironment[1:7, 1:7]
-#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
-#> [1,]    1    1    1    1    1    1    1
-#> [2,]    1    1    1    1    1    1    1
-#> [3,]    1    1    1    1    1    1    1
-#> [4,]    1    1    1    1    1    1    1
-#> [5,]    1    1    1    1    1    1    1
-#> [6,]    1    1    1    1    1    1    1
-#> [7,]    1    1    1    1    1    1    1
-table(extendedFamilyEnvironment)
-#> extendedFamilyEnvironment
-#>    1 
-#> 1296
-
-
-
-
-

Subsetting Pedigrees

-

Subsetting a pedigree allows researchers to focus on specific family -lines or individuals within a larger dataset. This can be particularly -useful for data validation as well as simplifying complex pedigrees for -visualization. However, subsetting a pedigree can result in the -underestimation of relatedness between individuals. This is because the -subsetted pedigree may not contain all the individuals that connect two -people together. For example, if we were to remove Arthur Weasley -(person 9) and Molly Prewett (person 10) from the potter -dataset, we would lose the connections amongst their children.

-
-Potter Subset Pedigree -

-Potter Subset Pedigree -

-
-

In the plot above, we have removed Arthur Weasley (person 9) and -Molly Prewett (person 10) from the potter dataset. As a -result, the connections between their children are lost.

-

Similarly, if we remove the children of Vernon Dursley (1) and -Petunia Evans (3) from the potter dataset, we would lose -the connections between the two individuals.

-

However, this subset does not plot the relationship between spouses -(such as the marriage between Vernon Dursley and Petunia Evans), as -there are not children to connect the two individuals together yet.

-
subset_rows <- c(1:5, 31:36)
-subset_potter <- potter[subset_rows, ]
-

-
- - - - - - - - - - - diff --git a/vignettes/v1_modelingvariancecomponents.Rmd b/vignettes/v1_modelingvariancecomponents.Rmd index 0cf74977..5e6ac18e 100644 --- a/vignettes/v1_modelingvariancecomponents.Rmd +++ b/vignettes/v1_modelingvariancecomponents.Rmd @@ -2,7 +2,7 @@ title: "Modeling variance components" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{modelingvariancecomponents} + %\VignetteIndexEntry{Modeling variance components} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -187,3 +187,19 @@ if (!requireNamespace("EasyMx", quietly = TRUE)) { summary(run2) } ``` + +## Extended Pedigrees + +The variance component framework used above for twin data extends naturally to +more complex family structures, such as multi-generational pedigrees. In this +setting, BGmisc can be used to derive the appropriate relatedness coefficients +(e.g., additive genetic, shared environmental, and dominance relationships) +from a pedigree object, and these coefficients can then be supplied to a +structural equation modeling package (such as EasyMx/OpenMx) to fit the model. + +In practice, the workflow mirrors the twin example: (1) prepare a pedigree +dataset with individual IDs and parental links, (2) use BGmisc functions to +compute the relevant relatedness matrices for the pedigree, (3) specify a +variance component model (e.g., ACE, ADE, or custom structures) that uses +these matrices, and (4) estimate the model parameters via maximum likelihood. +Please see the vignette on "Modeling with pedigrees" for a detailed example of this process. diff --git a/vignettes/v1_modelingvariancecomponents.html b/vignettes/v1_modelingvariancecomponents.html deleted file mode 100644 index ac2a09f6..00000000 --- a/vignettes/v1_modelingvariancecomponents.html +++ /dev/null @@ -1,543 +0,0 @@ - - - - - - - - - - - - - - -Modeling variance components - - - - - - - - - - - - - - - - - - - - - - - - - - -

Modeling variance components

- - - -
-

Introduction

-

This vignette provides a detailed guide to specific functions in the -BGmisc package that aid in the identification and fitting -of variance component models common in behavior genetics. We will -explore key functions such as identifyComponentModel, -providing practical examples and theoretical background. Identification -ensures a unique set of parameters that define the model-implied -covariance matrix, preventing free parameters from trading off one -another.

-
-

Loading Required Libraries

-

Ensure that the BGmisc package is installed and -loaded.

-

Ensure that the following dependencies are installed before -proceeding as they provide us with behavior genetic data and models:

-
    -
  • EasyMx

  • -
  • OpenMx

  • -
-
library(BGmisc)
-library(EasyMx)
-library(OpenMx)
-

Note: If any of the libraries are not installed, you can install them -using install.packages(“package_name”).

-
-
-
-

Working with Variance Component Models

-

In this section, we will demonstrate core functions related to the -identification and fitting of variance component models.

-
-

Using comp2vech Function

-

The comp2vech function is used to vectorize a components -model. The function is often used in conjunction with the identification -process. In this example, we apply it to a list of matrices:

-
comp2vech(list(
-  matrix(c(1, .5, .5, 1), 2, 2),
-  matrix(1, 2, 2)
-))
-#> [1] 1.0 0.5 1.0 1.0 1.0 1.0
-

The result showcases how the matrices have been transformed, -reflecting their role in subsequent variance component analysis.

-
-
-

Using identifyComponentModel Function

-

The identifyComponentModel function helps determine if a -variance components model is identified. It accepts relatedness -component matrices and returns information about identified and -non-identified parameters.

-

Here’s an example using the classical twin model with only MZ -twins:

-
identifyComponentModel(
-  A = list(matrix(1, 2, 2)),
-  C = list(matrix(1, 2, 2)),
-  E = diag(1, 2)
-)
-#> Component model is not identified.
-#> Non-identified parameters are  A, C
-#> $identified
-#> [1] FALSE
-#> 
-#> $nidp
-#> [1] "A" "C"
-

As you can see, the model is not identified. We need to add an -additional group so that we have sufficient information. Let us add the -rest of the classical twin model, in this case DZ twins.

-
identifyComponentModel(
-  A = list(matrix(c(1, .5, .5, 1), 2, 2), matrix(1, 2, 2)),
-  C = list(matrix(1, 2, 2), matrix(1, 2, 2)),
-  E = diag(1, 4)
-)
-#> Component model is identified.
-#> $identified
-#> [1] TRUE
-#> 
-#> $nidp
-#> character(0)
-

As you can see the model is identified, now that we’ve added another -group. Let us confirm by fitting a model. First we prepare the data.

-
library(dplyr)
-
-
-selVars <- c("ht1", "ht2")
-
-mzdzData <- subset(
-  twinData, zyg %in% c(1, 3),
-  c(selVars, "zyg")
-)
-
-mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
-
-
-mzData <- mzdzData %>% filter(zyg == 1)
-

Let us fit the data with MZ twins by themselves.

-
if (!requireNamespace("EasyMx", quietly = TRUE)) {
-  print("Please install EasyMx to run the model fitting examples.")
-} else {
-  library(EasyMx)
-  run1 <- emxTwinModel(
-    model = "Cholesky",
-    relatedness = "RCoef",
-    data = mzData,
-    use = selVars,
-    run = TRUE, name = "TwCh"
-  )
-
-  summary(run1)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.05122646           NA    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.03518629           NA       0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02325722 0.0007017955 !     0!       
-#> 4    Mht1  Means ht1   1 1.62974908 0.0027023907                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1112             -3693.148
-#>    Saturated:              5                   1111                    NA
-#> Independence:              4                   1112                    NA
-#> Number of observations/statistics: 569/1116
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -5917.148              -3685.148                -3685.078
-#> BIC:     -10747.543              -3667.773                -3680.471
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-24 22:17:23 
-#> Wall clock time: 0.05765486 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
-

As you can see the model was unsuccessful because it was not -identified. But when we add another group, so that the model is -identified, the model now fits.

-
if (!requireNamespace("EasyMx", quietly = TRUE)) {
-  print("Please install EasyMx to run the model fitting examples.")
-} else {
-  library(EasyMx)
-  run2 <- emxTwinModel(
-    model = "Cholesky",
-    relatedness = "RCoef",
-    data = mzdzData,
-    use = selVars,
-    run = TRUE, name = "TwCh"
-  )
-
-  summary(run2)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.06339271 0.0014377690    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.00000100 0.0250260004 !     0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02330040 0.0007015267       0!       
-#> 4    Mht1  Means ht1   1 1.63295540 0.0020511844                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1803             -5507.092
-#>    Saturated:              5                   1802                    NA
-#> Independence:              4                   1803                    NA
-#> Number of observations/statistics: 920/1807
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -9113.092              -5499.092                -5499.048
-#> BIC:     -17811.437              -5479.794                -5492.498
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-24 22:17:23 
-#> Wall clock time: 0.04731297 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
-
-
- - - - - - - - - - - diff --git a/vignettes/v2_pedigree.Rmd b/vignettes/v2_pedigree.Rmd index 5f0f3698..b68b21f2 100644 --- a/vignettes/v2_pedigree.Rmd +++ b/vignettes/v2_pedigree.Rmd @@ -2,7 +2,7 @@ title: "Pedigree Simulation and Visualization with BGmisc" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Pedigree} + %\VignetteIndexEntry{Pedigree Simulation and Visualization} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/v2_pedigree.html b/vignettes/v2_pedigree.html deleted file mode 100644 index b8f34803..00000000 --- a/vignettes/v2_pedigree.html +++ /dev/null @@ -1,487 +0,0 @@ - - - - - - - - - - - - - - -Pedigree Simulation and Visualization with BGmisc - - - - - - - - - - - - - - - - - - - - - - - - - - -

Pedigree Simulation and Visualization with -BGmisc

- - - -
-

Introduction

-

Unlike Tolstoy, where only happy families are alike, all -pedigrees are alike – or at least, all simulated pedigrees are alike. -The simulatePedigree function generates a pedigree with a -user-specified number of generations and individuals per generation. -This function provides users the opportunity to test family models in -pedigrees with a customized pedigree length and width.

-

These pedigrees can be simulated as a function of several parameters, -including the number of children per mate, generations, sex ratio of -newborns, and mating rate. Given that large family pedigrees are -difficult to collect or access, simulated pedigrees serve as an -efficient tool for researchers. These simulated pedigrees are useful for -building family-based statistical models, and evaluating their -statistical properties, such as power, bias, and computational -efficiency.

-

To illustrate this functionality, let us generate a pedigree. This -pedigree has a total of four generations (Ngen), in which -each person who “mates”, grows a family with four offspring -(kpc). In our scenario, the number of male and female -newborns is equal, but can be adjusted via (sexR). In this -illustration 70% of individuals will mate and bear offspring -(marR). Such a pedigree structure can be simulated by -running the following code:

-
## Loading Required Libraries
-library(BGmisc)
-library(ggpedigree)
-set.seed(5)
-df_ped <- simulatePedigree(
-  kpc = 4,
-  Ngen = 4,
-  sexR = .5,
-  marR = .7
-)
-summary(df_ped)
-#>      fam                  ID             gen            dadID      
-#>  Length:57          Min.   :10101   Min.   :1.000   Min.   :10102  
-#>  Class :character   1st Qu.:10306   1st Qu.:3.000   1st Qu.:10204  
-#>  Mode  :character   Median :10320   Median :3.000   Median :10307  
-#>                     Mean   :10342   Mean   :3.298   Mean   :10263  
-#>                     3rd Qu.:10416   3rd Qu.:4.000   3rd Qu.:10311  
-#>                     Max.   :10432   Max.   :4.000   Max.   :10320  
-#>                                                     NA's   :13     
-#>      momID            spID           sex           
-#>  Min.   :10101   Min.   :10101   Length:57         
-#>  1st Qu.:10202   1st Qu.:10205   Class :character  
-#>  Median :10306   Median :10306   Mode  :character  
-#>  Mean   :10263   Mean   :10266                     
-#>  3rd Qu.:10316   3rd Qu.:10311                     
-#>  Max.   :10318   Max.   :10320                     
-#>  NA's   :13      NA's   :33
-

The simulation output is a data.frame with 57 rows and 7 -columns. Each row corresponds to a simulated individual.

-
df_ped[21, ]
-#>      fam    ID gen dadID momID  spID sex
-#> 21 fam 1 10312   3 10204 10202 10317   M
-

The columns represents the individual’s family ID, the individual’s -personal ID, the generation the individual is in, the IDs of their -father and mother, the ID of their spouse, and the biological sex of the -individual, respectively.

-
-

Summarizing Pedigrees

-
summarizeFamilies(df_ped, famID = "fam")$family_summary
-#>       fam count gen_mean gen_median gen_min gen_max    gen_sd spID_mean
-#>    <char> <int>    <num>      <num>   <num>   <num>     <num>     <num>
-#> 1:  fam 1    57 3.298246          3       1       4 0.8229935     10266
-#>    spID_median spID_min spID_max  spID_sd
-#>          <num>    <num>    <num>    <num>
-#> 1:     10305.5    10101    10320 68.79206
-
-
-

Plotting Pedigree

-

Pedigrees are visual diagrams that represent family relationships -across generations. They are commonly used in genetics to trace the -inheritance of specific traits or conditions. This vignette will guide -you through visualizing simulated pedigrees using the -plotPedigree function. This function is a wrapper function -for Kinship2’s base R plotting. The sister package -ggpedigree has a much nicer plotting function. It’s also available on -CRAN, but it is not a dependency of BGmisc. If you want to use -ggpedigree, you can install it with -install.packages("ggpedigree") and then use -ggplot2 syntax to plot pedigrees.

-
-

Single Pedigree Visualization

-

To visualize a single simulated pedigree, use the the -plotPedigree function allows you to visualize the pedigree -structure, including family relationships and individual -characteristics. The plot displays individuals across generations, with -lines connecting parents to their children, and spouses connected by -horizontal lines.

-
library(ggpedigree)
-
-df_ped_recoded <- recodeSex(df_ped, code_male = "M", recode_male = 1, recode_female = 0)
-
-ggpedigree::ggpedigree(df_ped_recoded,
-  personID = "ID",
-  code_male = 1
-)
-

-

In the resulting plot, biological males are represented by squares, -while biological females are represented by circles, following the -standard pedigree conventions.

-
-
-

Visualizing Multiple Pedigrees Side-by-Side

-

If you wish to compare different pedigrees side by side, you can plot -them together. For instance, let’s visualize pedigrees for families -spanning three and four generations, respectively.

-
set.seed(8)
-# Simulate a family with 3 generations
-df_ped_3 <- simulatePedigree(Ngen = 3)
-
-# Simulate a family with 4 generations
-df_ped_4 <- simulatePedigree(Ngen = 4)
-

You can use the ggpedigree package to plot multiple -pedigrees side by side. This package allows for more customization and -better aesthetics in pedigree visualization.

-

-

By examining the side-by-side plots, you can contrast and analyze the -structures of different families, tracing the inheritance of specific -traits or conditions if needed.

-
-
-
- - - - - - - - - - - diff --git a/vignettes/v3_analyticrelatedness.html b/vignettes/v3_analyticrelatedness.html deleted file mode 100644 index 7be26475..00000000 --- a/vignettes/v3_analyticrelatedness.html +++ /dev/null @@ -1,986 +0,0 @@ - - - - - - - - - - - - - - -Understanding and Computing Relatedness from Pedigree Data - - - - - - - - - - - - - - - - - - - - - - - - - - -

Understanding and Computing Relatedness -from Pedigree Data

- - - -
-

Introduction

-

When individuals share common ancestors, they are genetically -related: they are expected to carry some proportion of alleles that are -identical by descent. This expectation—called the relatedness -coefficient—is central to many areas of genetics, including heritability -estimation, pedigree-based modeling, twin and family studies, and the -construction of kinship matrices for mixed-effects models.

-

Understanding relatedness is key for interpreting familial -resemblance, controlling for shared genetic structure in statistical -models, and simulating or analyzing traits across multigenerational -pedigrees. But while the idea that “siblings are 50% related” is -familiar, the reasoning behind such numbers—and how to compute them -across complex family structures—is less transparent.

-

This vignette introduces the concept of relatedness from first -principles and walks through how it is calculated from pedigree data. It -begins with illustrative examples that explain expected relatedness -values for familiar relationships using simplified functions. These -examples clarify how shared ancestry translates into probabilistic -expectations about genetic similarity.

-

From there, the vignette introduces a general-purpose matrix-based -method for computing pairwise relatedness across pedigrees. Using the -ped2com() function, we demonstrate how to build additive -genetic relationship matrices under both complete and incomplete -parentage, and we evaluate how different assumptions affect the -resulting estimates. The goal is to provide a clear, rigorous, and -practical guide to computing relatedness in real data.

-
-
-

Relatedness Coefficient

-

The relatedness coefficient \(r\) -indexes the proportion of alleles shared identically by descent (IBD) -between two individuals. This value ranges from 0 (no shared alleles by -descent) to 1 (a perfect genetic match, which occurs when comparing an -individual to themselves, their identical twin, or their clone). Values -can be interpreted in the context of standard relationships: e.g., full -siblings are expected to have \(r = -0.5\), half siblings \(r = -0.25\), and first cousins \(r = -0.125\).

-

Wright’s (1922) classic formulation computes 𝑟 by summing across -shared ancestry paths:

-

\[ -r_{bc} = \sum \left(\frac{1}{2}\right)^{n+n'+1} (1+f_a) -\]

-

Here, \(n\) and \(n'\) are the number of generations from -each descendant to a common ancestor \(a\), and \(f_a\) is the inbreeding coefficient of -\(a\), assumed to be zero unless -specified otherwise.

-

The function calculateRelatedness computes the -relatedness coefficient based on the number of generations back to -common ancestors, whether the individuals are full siblings, and other -parameters. The function can be used to calculate relatedness for -various family structures, including full siblings, half siblings, and -cousins.

-
library(BGmisc)
-# Example usage:
-# For full siblings, the relatedness coefficient is expected to be 0.5:
-calculateRelatedness(generations = 1, full = TRUE)
-#> [1] 0.5
-# For half siblings, the relatedness coefficient is expected to be 0.25:
-calculateRelatedness(generations = 1, full = FALSE)
-#> [1] 0.25
-

The logic reflects the number and type of shared parents. For full -siblings, both parents are shared (generational paths = 1 + 1), while -for half siblings only one is (effectively halving the probability of -sharing an allele). In otherwords, when full = TRUE, each -sibling is one generation from the shared pair of parents, yielding -r=0.5. When full = FALSE, they share only one -parent, yielding r=0.25.

-
-
-

Inferring r from Observed Phenotypic Correlation

-

In some cases, you observe a phenotypic correlation (e.g., height, -cognition) between two individuals and want to infer what value of r -would be consistent with that correlation under a fixed ACE model

-

The inferRelatedness function inverts the equation:

-

\[ -\text{obsR} = r \cdot a^2 + \text{sharedC} \cdot c^2 -\]

-

to solve for:

-

\[ -r = \frac{\text{obsR} - \text{sharedC} \cdot c^2}{a^2} -\]

-

where: - obsR is the observed phenotypic correlation -between two individuals or groups. - aceA and -aceC represent the proportions of variance due to additive -genetic and shared environmental influences, respectively. - -sharedC is the shared-environment analog to the relatedness -coefficient: it indicates what proportion of the shared environmental -variance applies to this pair (e.g., 1 for siblings raised together, 0 -for siblings raised apart).

-
# Example usage:
-# Infer the relatedness coefficient:
-inferRelatedness(obsR = 0.5, aceA = 0.9, aceC = 0, sharedC = 0)
-#> [1] 0.5555556
-

In this example, the observed correlation is 0.5, and no shared -environmental variance is assumed. Given that additive genetic variance -accounts for 90% of trait variance, the inferred relatedness coefficient -is approximately 0.556. This reflects the proportion of genetic overlap -that would be required to produce the observed similarity under these -assumptions.

-
# Now assume shared environment is fully shared:
-inferRelatedness(obsR = 0.5, aceA = 0.45, aceC = 0.45, sharedC = 1)
-#> [1] 0.1111111
-

In this case, the observed phenotypic correlation is still 0.5, and -both additive genetic and shared environmental components are assumed to -explain 45% of the variance. Because the shared environment is fully -shared between individuals (sharedC = 1), much of the observed -similarity is attributed to C, leaving only a small portion attributable -to genetic relatedness. The function returns an inferred relatedness -coefficient of approximately 0.11 — that is, the amount of additive -genetic overlap required (under this model) to produce the remaining -unexplained correlation after accounting for shared environmental -similarity.

-
-
-

Computing Relatedness from Pedigree Data

-

The ped2com function computes relationship matrices from -pedigree data using a recursive algorithm based on parent-offspring -connections. Central to this computation is the parent -adjacency matrix, which defines how individuals in the -pedigree are connected across generations. The adjacency matrix acts as -the structural input from which genetic relatedness is propagated.

-

The function offers two methods for constructing this matrix:

-
    -
  1. The classic method, which assumes that all parents are known and -that the adjacency matrix is complete.
  2. -
  3. The partial parent method, which allows for missing values in the -parent adjacency matrix.
  4. -
-

When parent data are complete, both methods return equivalent -results. But when parental information is missing, their behavior -diverges. This vignette illustrates how and why these differences -emerge, and under what conditions the partial method provides more -accurate results.

-
-

Hazard Data Example

-

We begin with the hazard dataset. First, we examine -behavior under complete pedigree data.

-
library(BGmisc)
-library(ggpedigree)
-data(hazard)
-
-df <- hazard |> dplyr::rename(personID = ID) # this is the data that we will use for the example
-
-# Plot the pedigree to visualize relationships
-
-ggpedigree(df, config = list(
-  personID = "personID",
-  momID = "momID",
-  dadID = "dadID",
-  famID = "famID",
-  code_male = 0
-))
-#> Warning in buildPlotConfig(default_config = default_config, config = config, :
-#> The following config values are not recognized by getDefaultPlotConfig():
-#> momID, dadID, famID
-

-

We compute the additive genetic relationship matrix using both the -classic and partial parent methods. Because the pedigree is complete, we -expect no differences in the resulting matrices.

-
ped_add_partial_complete <- ped2com(df,
-  isChild_method = "partialparent",
-  component = "additive",
-  adjacency_method = "direct",
-  sparse = FALSE
-)
-ped_add_classic_complete <- ped2com(df,
-  isChild_method = "classic",
-  component = "additive", adjacency_method = "direct",
-  sparse = FALSE
-)
-

The following plots display the full additive matrices. These -matrices should be identical.

-

This can be confirmed visually and numerically.

-
library(ggpedigree)
-ggRelatednessMatrix(as.matrix(ped_add_classic_complete),
-  config =
-    list(title = "Additive component - Classic method")
-)
-#> Warning in buildPlotConfig(default_config = default_config, config = config, :
-#> The following config values are not recognized by getDefaultPlotConfig(): title
-

-

-ggRelatednessMatrix(as.matrix(ped_add_partial_complete),
-  config =
-    list(title = "Additive component - Partial parent method")
-)
-#> Warning in buildPlotConfig(default_config = default_config, config = config, :
-#> The following config values are not recognized by getDefaultPlotConfig(): title
-

-

To verify this, we subtract one matrix from the other and calculate -RMSE. The difference should be numerically zero. Indeed, it is 0.

-
library(corrplot)
-#> corrplot 0.95 loaded
-corrplot((as.matrix(ped_add_classic_complete) - as.matrix(ped_add_partial_complete)),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-
-
-

Introducing Missingness: Remove a Parent

-

To observe how the two methods diverge when data are incomplete, we -remove one parent—starting with the mother of individual 4.

-
df$momID[df$ID == 4] <- NA
-
ped_add_partial_mom <- ped_add_partial <- ped2com(df,
-  isChild_method = "partialparent",
-  component = "additive",
-  adjacency_method = "direct",
-  sparse = FALSE
-)
-
-ped_add_classic_mom <- ped_add_classic <- ped2com(df,
-  isChild_method = "classic",
-  component = "additive", adjacency_method = "direct",
-  sparse = FALSE
-)
-

The two methods now treat individual 4 differently in the parent -adjacency matrix. The classic method applies a fixed contribution -because one parent remains. The partial parent method inflates the -individual’s diagonal contribution to account for the missing -parent.

-

The resulting additive matrices reflect this difference. The RMSE -between the two matrices is 0.

-
corrplot(as.matrix(ped_add_classic),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic (mother removed)",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-
corrplot(as.matrix(ped_add_partial),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial (mother removed)",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

We quantify the overall matrix difference:

-
sqrt(mean((as.matrix(ped_add_classic) - as.matrix(ped_add_partial))^2))
-#> [1] 0
-

Next, we compare each method to the matrix from the complete -pedigree. This evaluates how much each method deviates from the correct -additive structure.

-
corrplot(as.matrix(ped_add_classic_complete) - as.matrix(ped_add_classic),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE,
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-#> Warning in corrplot(as.matrix(ped_add_classic_complete) -
-#> as.matrix(ped_add_classic), : col.lim interval too wide, please set a suitable
-#> value
-

-

-sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
-#> [1] 0
-

The RMSE between the true additive component and the classic method -is 0.

-
corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE,
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-#> Warning in corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), :
-#> col.lim interval too wide, please set a suitable value
-

-

-sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
-#> [1] 0
-

The RMSE between the true additive component and the partial parent -method is 0.

-

The partial method shows smaller deviations from the complete matrix, -confirming that it better preserves relatedness structure when one -parent is missing.

-
-

Removing the Father Instead

-

We now repeat the same process, this time removing the father of -individual 4.

-
data(hazard)
-
-df <- hazard # this is the data that we will use for the example
-
-
-df$dadID[df$ID == 4] <- NA
-# add
-ped_add_partial_dad <- ped_add_partial <- ped2com(df,
-  isChild_method = "partialparent",
-  component = "additive",
-  adjacency_method = "direct",
-  sparse = FALSE
-)
-
-ped_add_classic_dad <- ped_add_classic <- ped2com(df,
-  isChild_method = "classic",
-  component = "additive", adjacency_method = "direct",
-  sparse = FALSE
-)
-

As we can see, the two matrices are different. The RMSE between the -two matrices is 0.009811.

-
corrplot(as.matrix(ped_add_classic_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic (father removed)",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(ped_add_partial_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial (father removed)",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

Again, we compare to the true matrix from the complete pedigree:

-
corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE,
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
-#> [1] 0.02991371
-
corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE,
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
-#> [1] 0.02825904
-

The partial parent method again yields a matrix closer to the -full-data version.

-
-
-
-

Inbreeding Dataset: Family-Level Evaluation

-

To generalize the comparison across a larger and more varied set of -pedigrees, we use the inbreeding dataset. Each family in -this dataset is analyzed independently.

-
data("inbreeding")
-
-df <- inbreeding
-
-famIDs <- unique(df$famID)
-

For each one, we construct the additive relationship matrix under -complete information and then simulate two missingness scenarios:

-
    -
  • Missing mother: One individual with a known mother is randomly -selected, and the mother’s ID is set to NA.

  • -
  • Missing father: Similarly, one individual with a known father is -selected, and the father’s ID is set to NA.

  • -
-

In each condition, we recompute the additive matrix using both the -classic and partial parent methods. We then calculate the RMSE between -each estimate and the matrix from the complete pedigree. This allows us -to quantify which method more accurately reconstructs the original -relatedness structure when parental data are partially missing.

-
inbreeding_list <- list()
-results <- data.frame(
-  famIDs = famIDs,
-  RMSE_partial_dad = rep(NA, length(famIDs)),
-  RMSE_partial_mom = rep(NA, length(famIDs)),
-  RMSE_classic_dad = rep(NA, length(famIDs)),
-  RMSE_classic_mom = rep(NA, length(famIDs)),
-  max_R_classic_dad = rep(NA, length(famIDs)),
-  max_R_partial_dad = rep(NA, length(famIDs)),
-  max_R_classic_mom = rep(NA, length(famIDs)),
-  max_R_partial_mom = rep(NA, length(famIDs)),
-  max_R_classic = rep(NA, length(famIDs))
-)
-

The loop below performs this procedure for all families in the -dataset and stores both the RMSEs and the maximum relatedness -values.

-
for (i in seq_len(famIDs)) {
-  # make three versions to filter down
-  df_fam_dad <- df_fam_mom <- df_fam <- df[df$famID == famIDs[i], ]
-
-  results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete - ped_add_partial_mom)^2))
-
-
-  ped_add_partial_complete <- ped2com(df_fam,
-    isChild_method = "partialparent",
-    component = "additive",
-    adjacency_method = "direct",
-    sparse = FALSE
-  )
-
-  ped_add_classic_complete <- ped2com(df_fam,
-    isChild_method = "classic",
-    component = "additive",
-    adjacency_method = "direct",
-    sparse = FALSE
-  )
-
-
-  # select first ID with a mom and dad
-  momid_to_cut <- head(df_fam$ID[!is.na(df_fam$momID)], 1)
-  dadid_to_cut <- head(df_fam$ID[!is.na(df_fam$dadID)], 1)
-
-  df_fam_dad$dadID[df_fam$ID == dadid_to_cut] <- NA
-
-  df_fam_mom$momID[df_fam$ID == momid_to_cut] <- NA
-
-  ped_add_partial_dad <- ped2com(df_fam_dad,
-    isChild_method = "partialparent",
-    component = "additive",
-    adjacency_method = "direct",
-    sparse = FALSE
-  )
-  ped_add_classic_dad <- ped2com(df_fam_dad,
-    isChild_method = "classic",
-    component = "additive", adjacency_method = "direct",
-    sparse = FALSE
-  )
-
-  results$RMSE_partial_dad[i] <- sqrt(mean((ped_add_classic_complete - ped_add_partial_dad)^2))
-  results$RMSE_classic_dad[i] <- sqrt(mean((ped_add_classic_complete - ped_add_classic_dad)^2))
-  results$max_R_classic_dad[i] <- max(as.matrix(ped_add_classic_dad))
-  results$max_R_partial_dad[i] <- max(as.matrix(ped_add_partial_dad))
-
-
-  ped_add_partial_mom <- ped2com(df_fam_mom,
-    isChild_method = "partialparent",
-    component = "additive",
-    adjacency_method = "direct",
-    sparse = FALSE
-  )
-
-  ped_add_classic_mom <- ped2com(df_fam_mom,
-    isChild_method = "classic",
-    component = "additive", adjacency_method = "direct",
-    sparse = FALSE
-  )
-
-  results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete - ped_add_partial_mom)^2))
-  results$RMSE_classic_mom[i] <- sqrt(mean((ped_add_classic_complete - ped_add_classic_mom)^2))
-  results$max_R_classic_mom[i] <- max(as.matrix(ped_add_classic_mom))
-  results$max_R_partial_mom[i] <- max(as.matrix(ped_add_partial_mom))
-  results$max_R_classic[i] <- max(as.matrix(ped_add_classic_complete))
-
-  inbreeding_list[[i]] <- list(
-    df_fam = df_fam,
-    ped_add_partial_complete = ped_add_partial_complete,
-    ped_add_classic_complete = ped_add_classic_complete,
-    ped_add_partial_dad = ped_add_partial_dad,
-    ped_add_classic_dad = ped_add_classic_dad,
-    ped_add_partial_mom = ped_add_partial_mom,
-    ped_add_classic_mom = ped_add_classic_mom
-  )
-}
-#> Warning in seq_len(famIDs): first element used of 'length.out' argument
-
-

Example: Family 1

-

To understand what these matrices look like, we visualize them for -one representative family. For this example, we select the first family -in the dataset.

-

-
# pull the first family from the list
-fam1 <- inbreeding_list[[1]]
-
-corrplot(as.matrix(fam1$ped_add_classic_complete),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic - Complete",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_classic_mom),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic - Mom Missing",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_partial_mom),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial - Mom Missing",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_classic_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic - Dad Missing",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_partial_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial - Dad Missing",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

To visualize the differences from the true matrix:

-
corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic Mom Diff from Complete",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial Mom Diff from Complete",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic Dad Diff from Complete",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

-corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial Dad Diff from Complete",
-  order = "hclust",
-  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
-  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
-)
-

-

These plots show how each method responds to missing data, and -whether it maintains consistency with the complete pedigree. We observe -that the partial parent method typically introduces smaller deviations. -If desired, this same diagnostic can be repeated for additional -families, such as inbreeding_list[[2]].

-
-
-
-

Summary

-

Across all families in the inbreeding dataset, the results show a -consistent pattern: the partial parent method outperforms the classic -method in reconstructing the additive genetic relationship matrix when -either a mother or a father is missing.

-

To make this explicit, we calculate the RMSE difference between -methods. A positive value means that the partial method had lower RMSE -(i.e., better accuracy) than the classic method:

-
results <- as.data.frame(results)
-
-results$RMSE_diff_dad <- results$RMSE_classic_dad - results$RMSE_partial_dad
-results$RMSE_diff_mom <- results$RMSE_classic_mom - results$RMSE_partial_mom
-

We can then summarize the pattern across families:

-
summary(dplyr::select(results, RMSE_diff_mom, RMSE_diff_dad))
-#>  RMSE_diff_mom      RMSE_diff_dad     
-#>  Min.   :0.002127   Min.   :0.002127  
-#>  1st Qu.:0.002127   1st Qu.:0.002127  
-#>  Median :0.002127   Median :0.002127  
-#>  Mean   :0.002127   Mean   :0.002127  
-#>  3rd Qu.:0.002127   3rd Qu.:0.002127  
-#>  Max.   :0.002127   Max.   :0.002127  
-#>  NA's   :7          NA's   :7
-

In all families, both RMSE_diff_mom and -RMSE_diff_dad are positive—indicating that the classic -method produces larger the errors relative to the partial method. This -holds regardless of whether the missing parent is a mother or a -father.

-

To verify this directly:

-
mean(results$RMSE_diff_mom > 0, na.rm = TRUE)
-#> [1] 1
-mean(results$RMSE_diff_dad > 0, na.rm = TRUE)
-#> [1] 1
-

These proportions show how often the partial method produces a lower -RMSE across the dataset. This confirms the earlier findings: when -pedigree data are incomplete, the partial parent method more faithfully -reconstructs the full-data relatedness matrix.

-
results |>
-  as.data.frame() |>
-  dplyr::select(
-    -famIDs, -RMSE_diff_mom, -RMSE_diff_dad, -max_R_classic_dad,
-    -max_R_partial_dad, -max_R_classic_mom, -max_R_partial_mom, -max_R_classic
-  ) |>
-  summary()
-#>  RMSE_partial_dad  RMSE_partial_mom  RMSE_classic_dad  RMSE_classic_mom 
-#>  Min.   :0.05634   Min.   :0.05634   Min.   :0.05846   Min.   :0.05846  
-#>  1st Qu.:0.05634   1st Qu.:0.05634   1st Qu.:0.05846   1st Qu.:0.05846  
-#>  Median :0.05634   Median :0.05634   Median :0.05846   Median :0.05846  
-#>  Mean   :0.05634   Mean   :0.05634   Mean   :0.05846   Mean   :0.05846  
-#>  3rd Qu.:0.05634   3rd Qu.:0.05634   3rd Qu.:0.05846   3rd Qu.:0.05846  
-#>  Max.   :0.05634   Max.   :0.05634   Max.   :0.05846   Max.   :0.05846  
-#>  NA's   :7         NA's   :7         NA's   :7         NA's   :7
-

This summary provides an overview of the RMSE values for each method -across families. The advantage of the partial parent method in -reconstructing relatedness was consistent when parental data are -missing.

-
-
- - - - - - - - - - - diff --git a/vignettes/v4_validation.html b/vignettes/v4_validation.html deleted file mode 100644 index b3e32994..00000000 --- a/vignettes/v4_validation.html +++ /dev/null @@ -1,948 +0,0 @@ - - - - - - - - - - - - - - -Validating and Repairing Pedigree Data with BGmisc - - - - - - - - - - - - - - - - - - - - - - - - - - -

Validating and Repairing Pedigree Data with -BGmisc

- - - -
-

Introduction

-

Working with pedigree data often involves dealing with -inconsistencies, missing information, and errors. The -BGmisc package provides tools to identify and, where -possible, repair these issues automatically. This vignette demonstrates -how to validate and clean pedigree data using BGmisc’s -validation functions.

-
-
-

Identifying and Repairing ID Issues

-

The checkIDs() function detects two types of common ID -errors in pedigree data:

-
    -
  • Between-row duplication: When two or more individuals share the same -ID
  • -
  • Within-row duplication: When an individual’s parents’ IDs are -incorrectly listed as their own ID
  • -
-

These problems are especially common when merging family records or -processing historical data. Let’s explore how they show up — and what -they imply for downstream structure.

-
-

A Clean Dataset

-

We’ll begin with the Potter family dataset, cleaned and reformatted -with ped2fam():

-
library(BGmisc)
-
-# Load our example dataset
-df <- ped2fam(potter, famID = "newFamID", personID = "personID")
-
-# Check for ID issues
-checkIDs(df, repair = FALSE)
-#> $all_unique_ids
-#> [1] TRUE
-#> 
-#> $total_non_unique_ids
-#> [1] 0
-#> 
-#> $non_unique_ids
-#> NULL
-#> 
-#> $total_own_father
-#> [1] 0
-#> 
-#> $total_own_mother
-#> [1] 0
-#> 
-#> $total_duplicated_parents
-#> [1] 0
-#> 
-#> $total_within_row_duplicates
-#> [1] 0
-#> 
-#> $within_row_duplicates
-#> [1] FALSE
-#> 
-#> $is_own_father_ids
-#> NULL
-#> 
-#> $is_own_mother_ids
-#> NULL
-#> 
-#> $duplicated_parents_ids
-#> NULL
-

There are no duplicated or self-referential IDs here. But things -rarely stay that simple.

-
-

What checkIDs() Reports

-

The checkIDs() function checks for:

-
    -
  • Whether all IDs are unique (reported by all_unique_ids, -which tells you if all IDs in the dataset are unique, and -total_non_unique_ids, which gives you the count of -non-unique IDs found)
  • -
  • Cases where someone’s ID matches their parent’s ID (shown in -total_own_father and total_own_mother, which -count individuals whose father’s or mother’s ID matches their own -ID)
  • -
  • Total duplicated parent IDs (tracked by -total_duplicated_parents, which counts individuals with -duplicated parent IDs)
  • -
  • Within-row duplicates (measured by -total_within_row_duplicates showing the count and -within_row_duplicates indicating their presence)
  • -
-

If you set repair = TRUE, the function will attempt to -fix the issues it finds. We’ll explore this later.

-
-
-
-

A Tale of Two Duplicates

-

To understand how these tools work in practice, let’s create a -dataset with two common real-world problems. First, we’ll accidentally -give Vernon Dursley the same ID as his sister Marjorie (a common issue -when merging family records). Then, we’ll add a complete duplicate of -Dudley Dursley (as might happen during data entry).

-
# Create our problematic dataset
-df_duplicates <- df
-# Sibling ID conflict
-df_duplicates$personID[df_duplicates$name == "Vernon Dursley"] <-
-  df_duplicates$personID[df_duplicates$name == "Marjorie Dursley"]
-# Duplicate entry
-df_duplicates <- rbind(
-  df_duplicates,
-  df_duplicates[df_duplicates$name == "Dudley Dursley", ]
-)
-

If we look at the data using standard tools, the problems aren’t -immediately obvious:

-
library(tidyverse)
-
-summarizeFamilies(df_duplicates,
-  famID = "newFamID",
-  personID = "personID"
-)$family_summary %>%
-  glimpse()
-#> Rows: 1
-#> Columns: 27
-#> $ newFamID        <dbl> 1
-#> $ count           <int> 37
-#> $ famID_mean      <dbl> 1
-#> $ famID_median    <dbl> 1
-#> $ famID_min       <dbl> 1
-#> $ famID_max       <dbl> 1
-#> $ famID_sd        <dbl> 0
-#> $ gen_mean        <dbl> 1.756757
-#> $ gen_median      <dbl> 2
-#> $ gen_min         <dbl> 0
-#> $ gen_max         <dbl> 3
-#> $ gen_sd          <dbl> 1.038305
-#> $ spouseID_mean   <dbl> 38.2
-#> $ spouseID_median <dbl> 15
-#> $ spouseID_min    <dbl> 1
-#> $ spouseID_max    <dbl> 106
-#> $ spouseID_sd     <dbl> 44.15118
-#> $ sex_mean        <dbl> 0.5135135
-#> $ sex_median      <dbl> 1
-#> $ sex_min         <dbl> 0
-#> $ sex_max         <dbl> 1
-#> $ sex_sd          <dbl> 0.5067117
-#> $ twinID_mean     <dbl> 12.5
-#> $ twinID_median   <dbl> 12.5
-#> $ twinID_min      <dbl> 12
-#> $ twinID_max      <dbl> 13
-#> $ twinID_sd       <dbl> 0.7071068
-

But checkIDs() detects the problems clearly:

-
# Identify duplicates
-result <- checkIDs(df_duplicates)
-print(result)
-#> $all_unique_ids
-#> [1] FALSE
-#> 
-#> $total_non_unique_ids
-#> [1] 4
-#> 
-#> $non_unique_ids
-#> [1] 2 6
-#> 
-#> $total_own_father
-#> [1] 0
-#> 
-#> $total_own_mother
-#> [1] 0
-#> 
-#> $total_duplicated_parents
-#> [1] 0
-#> 
-#> $total_within_row_duplicates
-#> [1] 0
-#> 
-#> $within_row_duplicates
-#> [1] FALSE
-#> 
-#> $is_own_father_ids
-#> NULL
-#> 
-#> $is_own_mother_ids
-#> NULL
-#> 
-#> $duplicated_parents_ids
-#> NULL
-

As we can see from this output, there are 4 non-unique IDs in the -dataset, specifically 2, 6. Let’s take a peek at the duplicates:

-
# Let's examine the problematic entries
-df_duplicates %>%
-  filter(personID %in% result$non_unique_ids) %>%
-  arrange(personID)
-#>    personID newFamID famID             name first_name surname gen momID dadID
-#> 1         2        1     1   Vernon Dursley     Vernon Dursley   1   101   102
-#> 2         2        1     1 Marjorie Dursley   Marjorie Dursley   1   101   102
-#> 6         6        1     1   Dudley Dursley     Dudley Dursley   2     3     1
-#> 61        6        1     1   Dudley Dursley     Dudley Dursley   2     3     1
-#>    spouseID sex twinID zygosity
-#> 1         3   1     NA     <NA>
-#> 2        NA   0     NA     <NA>
-#> 6        NA   1     NA     <NA>
-#> 61       NA   1     NA     <NA>
-

Yep, these are definitely the duplicates.

-
-

Repairing Between-Row Duplicates

-

Some ID issues can be fixed automatically. Let’s try the repair -option:

-
df_repair <- checkIDs(df, repair = TRUE)
-
-df_repair %>%
-  filter(ID %in% result$non_unique_ids) %>%
-  arrange(ID)
-#>   ID newFamID famID             name first_name surname gen momID dadID spID
-#> 1  2        1     1 Marjorie Dursley   Marjorie Dursley   1   101   102   NA
-#> 2  6        1     1   Dudley Dursley     Dudley Dursley   2     3     1   NA
-#>   sex twinID zygosity
-#> 1   0     NA     <NA>
-#> 2   1     NA     <NA>
-
-result <- checkIDs(df_repair)
-
-print(result)
-#> $all_unique_ids
-#> [1] TRUE
-#> 
-#> $total_non_unique_ids
-#> [1] 0
-#> 
-#> $non_unique_ids
-#> NULL
-#> 
-#> $total_own_father
-#> [1] 0
-#> 
-#> $total_own_mother
-#> [1] 0
-#> 
-#> $total_duplicated_parents
-#> [1] 0
-#> 
-#> $total_within_row_duplicates
-#> [1] 0
-#> 
-#> $within_row_duplicates
-#> [1] FALSE
-#> 
-#> $is_own_father_ids
-#> NULL
-#> 
-#> $is_own_mother_ids
-#> NULL
-#> 
-#> $duplicated_parents_ids
-#> NULL
-

Great! Notice what happened here: the function was able to repair the -full duplicate, without any manual intervention. That still leaves us -with the sibling ID conflict, but that’s a more complex issue that would -require manual intervention. We’ll leave that for now.

-

So far we’ve only checked for violations of uniqueness. But do these -errors also affect the graph structure? Let’s find out.

-
-
-
-

Oedipus ID

-

Just as Oedipus discovered his true relationship was not what records -suggested, our data can reveal its own confused parentage when an ID is -incorrectly listed as its own parent. Let’s examine this error:

-

Sometimes, an individual’s parents’ IDs may be incorrectly listed as -their own ID, leading to within-row duplicates. The checkIDs function -can also identify these errors:

-
# Create a sample dataset with within-person duplicate parent IDs
-
-df_within <- ped2fam(potter, famID = "newFamID", personID = "personID")
-
-df_within$momID[df_within$name == "Vernon Dursley"] <- df_within$personID[df_within$name == "Vernon Dursley"]
-
-# Check for within-row duplicates
-result <- checkIDs(df_within, repair = FALSE)
-print(result)
-#> $all_unique_ids
-#> [1] TRUE
-#> 
-#> $total_non_unique_ids
-#> [1] 0
-#> 
-#> $non_unique_ids
-#> NULL
-#> 
-#> $total_own_father
-#> [1] 0
-#> 
-#> $total_own_mother
-#> [1] 1
-#> 
-#> $total_duplicated_parents
-#> [1] 0
-#> 
-#> $total_within_row_duplicates
-#> [1] 1
-#> 
-#> $within_row_duplicates
-#> [1] TRUE
-#> 
-#> $is_own_father_ids
-#> NULL
-#> 
-#> $is_own_mother_ids
-#> [1] 1
-#> 
-#> $duplicated_parents_ids
-#> NULL
-

In this example, we have created a within-row duplicate by setting -the momID of Vernon Dursley to his own ID. The checkIDs -function correctly identifies that this error is present.

-

To repair within-row duplicates, you will be able to set the repair -argument to TRUE, eventually. This feature is currently -under development and will be available in future versions of the -package. In the meantime, you can manually inspect and then correct -these errors in your dataset.

-
# Find the problematic entry
-
-df_within[df_within$momID %in% result$is_own_mother_ids, ]
-#>   personID newFamID famID           name first_name surname gen momID dadID
-#> 1        1        1     1 Vernon Dursley     Vernon Dursley   1     1   102
-#>   spouseID sex twinID zygosity
-#> 1        3   1     NA     <NA>
-

There are several ways to correct this issue, depending on the -specifics of your dataset. In this case, you could correct the momID for -Vernon Dursley to the correct value, resolving the within-row duplicate, -likely by assuming that his sister Marjorie shares the same mother.

-
-
-
-

Identifying and Repairing Sex Coding Issues

-

Another critical aspect of pedigree validation is ensuring the -consistency of sex coding. This brings us to an important distinction in -genetic studies between biological sex (genotype) and gender identity -(phenotype):

-
    -
  • Biological sex (genotype) refers to an individual’s chromosomal -configuration, typically XX for female and XY for male in humans, though -variations exist.
  • -
  • Gender identity (phenotype) encompasses a broader, richer, personal, -deeply-held sense of being male, female, a blend of both, neither, or -another gender entirely.
  • -
-

The checkSex function focuses on biological sex coding -consistency, particularly looking for: - Mismatches between parent roles -and recorded sex - Individuals listed as both parent and child - -Inconsistent sex coding across the dataset

-

Let’s examine how it works:

-
# Validate sex coding
-
-results <- checkSex(potter,
-  code_male = 1,
-  code_female = 0,
-  verbose = TRUE, repair = FALSE
-)
-#> Standardizing column names...
-#> Step 1: Checking how many sexes/genders...
-#> 2 unique sex codes found: 1, 0
-#> Role: dadID
-#> 1  unique sex codes found:  1 
-#> Modal sex code:  1 
-#> All parents consistently coded.
-#> Role: momID
-#> 1  unique sex codes found:  0 
-#> Modal sex code:  0 
-#> All parents consistently coded.
-#> Checks Made:
-#> c(1, 0)21010numeric(0)numeric(0)numeric(0)numeric(0)
-print(results)
-#> $sex_unique
-#> [1] 1 0
-#> 
-#> $sex_length
-#> [1] 2
-#> 
-#> $all_sex_dad
-#> [1] 1
-#> 
-#> $all_sex_mom
-#> [1] 0
-#> 
-#> $most_frequent_sex_dad
-#> [1] 1
-#> 
-#> $most_frequent_sex_mom
-#> [1] 0
-#> 
-#> $ID_female_dads
-#> numeric(0)
-#> 
-#> $ID_child_female_dads
-#> numeric(0)
-#> 
-#> $ID_male_moms
-#> numeric(0)
-#> 
-#> $ID_child_male_moms
-#> numeric(0)
-

When inconsistencies are found, you can attempt automatic repair:

-
# Repair sex coding
-df_fix <- checkSex(potter,
-  code_male = 1,
-  code_female = 0,
-  verbose = TRUE, repair = TRUE
-)
-#> Standardizing column names...
-#> Step 1: Checking how many sexes/genders...
-#> 2 unique sex codes found: 1, 0
-#> Role: dadID
-#> 1  unique sex codes found:  1 
-#> Modal sex code:  1 
-#> All parents consistently coded.
-#> Role: momID
-#> 1  unique sex codes found:  0 
-#> Modal sex code:  0 
-#> All parents consistently coded.
-#> Step 2: Attempting to repair sex coding...
-#> Changes Made:
-#> Recode sex based on most frequent sex in dads: 1. Total sex changes made:  36
-print(df_fix)
-#>     ID famID               name first_name  surname gen momID dadID spID sex
-#> 1    1     1     Vernon Dursley     Vernon  Dursley   1   101   102    3   M
-#> 2    2     1   Marjorie Dursley   Marjorie  Dursley   1   101   102   NA   F
-#> 3    3     1      Petunia Evans    Petunia    Evans   1   103   104    1   F
-#> 4    4     1         Lily Evans       Lily    Evans   1   103   104    5   F
-#> 5    5     1       James Potter      James   Potter   1    NA    NA    4   M
-#> 6    6     1     Dudley Dursley     Dudley  Dursley   2     3     1   NA   M
-#> 7    7     1       Harry Potter      Harry   Potter   2     4     5    8   M
-#> 8    8     1      Ginny Weasley      Ginny  Weasley   2    10     9    7   F
-#> 9    9     1     Arthur Weasley     Arthur  Weasley   1    NA    NA   10   M
-#> 10  10     1      Molly Prewett      Molly  Prewett   1    NA    NA    9   F
-#> 11  11     1        Ron Weasley        Ron  Weasley   2    10     9   17   M
-#> 12  12     1       Fred Weasley       Fred  Weasley   2    10     9   NA   M
-#> 13  13     1     George Weasley     George  Weasley   2    10     9   NA   M
-#> 14  14     1      Percy Weasley      Percy  Weasley   2    10     9   20   M
-#> 15  15     1    Charlie Weasley    Charlie  Weasley   2    10     9   NA   M
-#> 16  16     1       Bill Weasley       Bill  Weasley   2    10     9   18   M
-#> 17  17     1   Hermione Granger   Hermione  Granger   2    NA    NA   11   F
-#> 18  18     1     Fleur Delacour      Fleur Delacour   2   105   106   16   F
-#> 19  19     1 Gabrielle Delacour  Gabrielle Delacour   2   105   106   NA   F
-#> 20  20     1             Audrey     Audrey  Unknown   2    NA    NA   14   F
-#> 21  21     1    James Potter II      James   Potter   3     8     7   NA   M
-#> 22  22     1       Albus Potter      Albus   Potter   3     8     7   NA   M
-#> 23  23     1        Lily Potter       Lily   Potter   3     8     7   NA   F
-#> 24  24     1       Rose Weasley       Rose  Weasley   3    17    11   NA   F
-#> 25  25     1       Hugo Weasley       Hugo  Weasley   3    17    11   NA   M
-#> 26  26     1   Victoire Weasley   Victoire  Weasley   3    18    16   NA   F
-#> 27  27     1  Dominique Weasley  Dominique  Weasley   3    18    16   NA   F
-#> 28  28     1      Louis Weasley      Louis  Weasley   3    18    16   NA   M
-#> 29  29     1      Molly Weasley      Molly  Weasley   3    20    14   NA   F
-#> 30  30     1       Lucy Weasley       Lucy  Weasley   3    20    14   NA   F
-#> 31 101     1     Mother Dursley     Mother  Dursley   0    NA    NA  102   F
-#> 32 102     1     Father Dursley     Father  Dursley   0    NA    NA  101   M
-#> 33 104     1       Father Evans     Father    Evans   0    NA    NA  103   M
-#> 34 103     1       Mother Evans     Mother    Evans   0    NA    NA  104   F
-#> 35 106     1    Father Delacour     Father Delacour   0    NA    NA  105   M
-#> 36 105     1    Mother Delacour     Mother Delacour   0    NA    NA  106   F
-#>    twinID zygosity
-#> 1      NA     <NA>
-#> 2      NA     <NA>
-#> 3      NA     <NA>
-#> 4      NA     <NA>
-#> 5      NA     <NA>
-#> 6      NA     <NA>
-#> 7      NA     <NA>
-#> 8      NA     <NA>
-#> 9      NA     <NA>
-#> 10     NA     <NA>
-#> 11     NA     <NA>
-#> 12     13       mz
-#> 13     12       mz
-#> 14     NA     <NA>
-#> 15     NA     <NA>
-#> 16     NA     <NA>
-#> 17     NA     <NA>
-#> 18     NA     <NA>
-#> 19     NA     <NA>
-#> 20     NA     <NA>
-#> 21     NA     <NA>
-#> 22     NA     <NA>
-#> 23     NA     <NA>
-#> 24     NA     <NA>
-#> 25     NA     <NA>
-#> 26     NA     <NA>
-#> 27     NA     <NA>
-#> 28     NA     <NA>
-#> 29     NA     <NA>
-#> 30     NA     <NA>
-#> 31     NA     <NA>
-#> 32     NA     <NA>
-#> 33     NA     <NA>
-#> 34     NA     <NA>
-#> 35     NA     <NA>
-#> 36     NA     <NA>
-

When the repair argument is set to TRUE, repair process -follows several rules: - Parents listed as mothers must be female - -Parents listed as fathers must be male - Sex codes are standardized to -the specified code_male and code_female values - If no sex code is -provided, the function will attempt to infer what male and female are -coded with. The most frequently assigned sex for mothers and fathers -will be used as the standard.

-

Note that automatic repairs should be carefully reviewed, as they may -not always reflect the correct biological relationships. In cases where -the sex coding is ambiguous or conflicts with known relationships, -manual inspection and domain knowledge may be required.

- -
-
-

Best Practices for Pedigree Validation

-

Through extensive work with pedigree data, we’ve learned several key -principles:

-
    -
  • Always inspect your data before applying automatic repairs
  • -
  • Use summarizeFamilies() to get an overview of family structures
  • -
  • Keep detailed records of changes made during cleaning
  • -
  • Validate after each repair step
  • -
  • Create backups before applying repairs
  • -
  • Trust your domain knowledge - automatic repairs are helpful but not -infallible
  • -
-

By following these best practices, and leveraging functions like -checkIDs, checkSex, and -recodeSex, you can ensure the integrity of your pedigree -data, facilitating accurate analysis and research.

-
- - - - - - - - - - - diff --git a/vignettes/v5_ASOIAF.Rmd b/vignettes/v5_ASOIAF.Rmd index 680d6eb3..58c051f6 100644 --- a/vignettes/v5_ASOIAF.Rmd +++ b/vignettes/v5_ASOIAF.Rmd @@ -7,7 +7,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- -## Introduction +# Introduction Just how closely related are Jon Snow and Daenerys Targaryen? According to the lore of *A Song of Ice and Fire*, Daenerys is Jon's paternal aunt. This would suggest a theoretical genetic relatedness of 0.25, assuming a simple pedigree and no inbreeding. But with tangled ancestries and potentially missing information, how confident can we be in that estimate? @@ -69,7 +69,8 @@ add <- ped2com(df_got, isChild_method = "partialparent", component = "additive", adjacency_method = "direct", - sparse = TRUE + sparse = TRUE, + mz_twins = TRUE ) mt <- ped2com(df_got, @@ -86,6 +87,50 @@ cn <- ped2cn(df_got, ) ``` + +Each of these matrices is square, with dimensions equal to the number of individuals in the pedigree. The entries represent pairwise relatedness coefficients. + +To verify the matrices, we can plot their sparsity patterns: + +```{r fig.width=10, fig.height=4, message=FALSE, warning=FALSE} +par(mfrow = c(1, 3)) +ggRelatednessMatrix(as.matrix(add), + config = list( + matrix_color_palette = c("white", "gold", "red"), + color_scale_midpoint = 0.5, + matrix_cluster = TRUE, + plot_title = "Relatedness Matrix", + axis_x_label = "Individuals", + axis_y_label = "Individuals", + label_include = FALSE + ) +) +ggRelatednessMatrix(as.matrix(cn), + config = list( + matrix_color_palette = c("white", "lightblue", "blue"), + color_scale_midpoint = 0.5, + matrix_cluster = TRUE, + plot_title = "Common Nuclear Relatedness Matrix", + axis_x_label = "Individuals", + axis_y_label = "Individuals", + axis_text_size = 8 + ) +) +ggRelatednessMatrix(as.matrix(mt), + config = list( + matrix_color_palette = c("white", "lightgreen", "darkgreen"), + color_scale_midpoint = 0.5, + matrix_cluster = TRUE, + plot_title = "Mitochondrial Relatedness Matrix", + axis_x_label = "Individuals", + axis_y_label = "Individuals", + axis_text_size = 8 + ) +) + +par(mfrow = c(1, 1)) +``` + ## Convert to Pairwise Format For interpretability, we convert these square matrices into long-format tables using `com2links()`. This function returns a dataframe where each row represents a unique pair of individuals, including their additive and common nuclear coefficients. diff --git a/vignettes/v5_ASOIAF.html b/vignettes/v5_ASOIAF.html deleted file mode 100644 index 73674c37..00000000 --- a/vignettes/v5_ASOIAF.html +++ /dev/null @@ -1,672 +0,0 @@ - - - - - - - - - - - - - - -ASOIAF: How related are Jon and Danny? - - - - - - - - - - - - - - - - - - - - - - - - - - -

ASOIAF: How related are Jon and Danny?

- - - -
-

Introduction

-

Just how closely related are Jon Snow and Daenerys Targaryen? -According to the lore of A Song of Ice and Fire, Daenerys is -Jon’s paternal aunt. This would suggest a theoretical genetic -relatedness of 0.25, assuming a simple pedigree and no inbreeding. But -with tangled ancestries and potentially missing information, how -confident can we be in that estimate?

-

In this vignette, we use the BGmisc package to -reconstruct the ASOIAF pedigree from the -ggpedigree package, handle incomplete parentage data, and -compute additive genetic and common nuclear relatedness. We’ll focus on -Jon and Daenerys as a case study, but the methods generalize to any -characters in the provided dataset.

-
-
-

Load Packages and Data

-

We begin by loading the required libraries and examining the -structure of the built-in ASOIAF pedigree.

-
library(BGmisc)
-library(tidyverse)
-library(ggpedigree)
-data(ASOIAF, package = "ggpedigree")
-

The ASOIAF dataset includes character IDs, names, family identifiers, -and parent identifiers for a subset of characters drawn from the A -Song of Ice and Fire canon.

-
head(ASOIAF)
-
##   id famID momID dadID          name sex
-## 1  1     1   566   564   Walder Frey   M
-## 2  2     1    NA    NA   Perra Royce   F
-## 3  3     1     2     1  Stevron Frey   M
-## 4  4     1     2     1    Emmon Frey   M
-## 5  5     1     2     1    Aenys Frey   M
-## 6  6     1    NA    NA Corenna Swann   F
-##                                                   url twinID zygosity
-## 1   https://awoiaf.westeros.org/index.php/Walder_Frey     NA     <NA>
-## 2   https://awoiaf.westeros.org/index.php/Perra_Royce     NA     <NA>
-## 3  https://awoiaf.westeros.org/index.php/Stevron_Frey     NA     <NA>
-## 4    https://awoiaf.westeros.org/index.php/Emmon_Frey     NA     <NA>
-## 5    https://awoiaf.westeros.org/index.php/Aenys_Frey     NA     <NA>
-## 6 https://awoiaf.westeros.org/index.php/Corenna_Swann     NA     <NA>
-
-
-

Prepare and Validate Sex Codes

-

Many pedigree-based algorithms rely on biological sex for downstream -calculationss and visualization. We use checkSex() to -inspect the sex variable, repairing inconsistencies -programmatically.

-
df_got <- checkSex(ASOIAF,
-  code_male = 1,
-  code_female = 0,
-  verbose = FALSE, repair = TRUE
-)
-
-
-

Compute Relatedness Matrices

-

With validated pedigree data, we can now compute two distinct -relationship matrices:

-
    -
  • Additive genetic relatedness (add): Proportion of shared additive -genetic variance between individuals.

  • -
  • Common nuclear relatedness (cn): Indicates shared full-sibling -(nuclear family) environments.

  • -
-

These are derived using ped2add() and -ped2cn(), respectively. Both functions rely on internal -graph traversal and adjacency structures. In this case:

-
    -
  • We specify isChild_method = “partialparent” to allow inclusion of -dyads where one parent is unknown.

  • -
  • We choose adjacency_method = “direct” for the additive matrix to -optimize for computational speed.

  • -
  • For the common nuclear matrix, we use adjacency_method = -“indexed”, which is slower but necessary for resolving sibling-group -structures.

  • -
  • We set sparse = TRUE to return compressed sparse -matrices rather than full (dense) formats.

  • -
-
add <- ped2com(df_got,
-  isChild_method = "partialparent",
-  component = "additive",
-  adjacency_method = "direct",
-  sparse = TRUE
-)
-
-mt <- ped2com(df_got,
-  isChild_method = "partialparent",
-  component = "mitochondrial",
-  adjacency_method = "direct",
-  sparse = TRUE
-)
-
-cn <- ped2cn(df_got,
-  isChild_method = "partialparent",
-  adjacency_method = "indexed",
-  sparse = TRUE
-)
-
-
-

Convert to Pairwise Format

-

For interpretability, we convert these square matrices into -long-format tables using com2links(). This function returns -a dataframe where each row represents a unique pair of individuals, -including their additive and common nuclear coefficients.

-
df_links <- com2links(
-  writetodisk = FALSE,
-  ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt,
-  drop_upper_triangular = TRUE
-) # %>%
-#  filter(ID1 != ID2)
-

The function can return the entire matrix or just the lower -triangular part, which is often sufficient for our purposes. Setting -drop_upper_triangular = TRUE ensures we only retain one -entry per dyad, since the matrices are symmetric. We also keep the data -in memory by setting writetodisk = FALSE.

-
-
-

Locate Jon and Daenerys

-

We next identify the rows in the pairwise relatedness table that -correspond to Jon Snow and Daenerys Targaryen. First, we retrieve their -individual IDs:

-
# Find the IDs of Jon Snow and Daenerys Targaryen
-
-jon_id <- df_got %>%
-  filter(name == "Jon Snow") %>%
-  pull(ID)
-
-dany_id <- df_got %>%
-  filter(name == "Daenerys Targaryen") %>%
-  pull(ID)
-

We can then filter the pairwise relatedness table to isolate the dyad -of interest:

-
jon_dany_row <- df_links %>%
-  filter(ID1 == jon_id | ID2 == jon_id) %>%
-  filter(ID1 %in% dany_id | ID2 %in% dany_id) %>% # round to nearest 4th decimal
-  mutate(across(c(addRel, mitRel, cnuRel), ~ round(.x, 4)))
-
-jon_dany_row
-
##   ID1 ID2 addRel mitRel cnuRel
-## 1 206 211 0.5031      0      0
-## 2 211 304 0.0562      0      0
-

This table contains the additive nuclear relatedness estimates for -Jon and Daenerys. If the pedigree reflects their canonical aunt-nephew -relationship and is free from… complications, we’d expect to see an -additive coefficient close to 0.25. However, the value is 0.5031, -indicating a more complex relationship and in line with how related we -would expect full siblings to be.

-

Likewise, when we examine the relatedness for a different pair, such -as Rhaenyra Targaryen and Damemon Targaryen, we can see how the -relatedness coefficients vary across different characters in the -dataset.

-
rhaenyra_id <- df_got %>%
-  filter(name == "Rhaenyra Targaryen") %>%
-  pull(ID)
-daemon_id <- df_got %>%
-  filter(name == "Daemon Targaryen") %>%
-  pull(ID)
-
-rhaenyra_daemon_row <- df_links %>%
-  filter(ID1 == rhaenyra_id | ID2 == rhaenyra_id) %>%
-  filter(ID1 %in% daemon_id | ID2 %in% daemon_id) %>% # round to 4th decimal
-  mutate(across(c(addRel, mitRel, cnuRel), ~ round(.x, 4)))
-
-rhaenyra_daemon_row
-
##   ID1 ID2 addRel mitRel cnuRel
-## 1 339 536 0.7355      1      0
-

Similarly, we can see that Rhaenyra and Daemon have an additive -relatedness coefficient of 0.7355, which is also slightly higher than -the expected 0.25 for a full uncle-neice relationship. In terms of -mitochondrial relatedness, both pairs have a coefficient of 1, -indicating that they share the same mitochondrial lineage.

-
-
-

Plotting the Pedigree with Incomplete Parental Information

-

Many real-world and fictional pedigrees contain individuals with -unknown or partially known parentage. In such cases, plotting tools -typically fail unless these gaps are handled. We use -checkParentIDs() to:

-
    -
  • Identify individuals with one known parent and one -missing

  • -
  • Create “phantom” placeholders for the missing parent

  • -
  • Optionally repair and harmonize parent fields

  • -
-

To facilitate plotting, we check for individuals with one known -parent but a missing other. For those cases, we assign a placeholder ID -to the missing parent.

-
df_repaired <- checkParentIDs(df_got, # %>% filter(famID == 1),
-  addphantoms = TRUE,
-  repair = TRUE,
-  parentswithoutrow = FALSE,
-  repairsex = FALSE
-) %>% mutate(
-  # famID = 1,
-  affected = case_when(
-    ID %in% c(jon_id, dany_id, 339) ~ TRUE,
-    TRUE ~ FALSE
-  )
-)
-
## REPAIR IN EARLY ALPHA
-

This code creates new IDs for individuals with one known parent and a -missing other. It checks if either momID or -dadID is missing, and if so, it assigns a new ID based on -the row number. This allows us to visualize the pedigree even when some -parental information is incomplete. Now we can check the repaired -pedigree for unique IDs and parent-child relationships.

-
checkIDs <- checkIDs(df_repaired, verbose = TRUE)
-
## Standardizing column names...
-
## Checking IDs...
-## Step 1: Checking for unique IDs...
-## All IDs are unique.
-## Step 2: Checking for within row duplicats...
-## No within row duplicates found.
-## Validation Results:
-
## TRUE0NULL0000FALSENULLNULLNULL
-
# checkIDs
-
# Check for unique IDs and parent-child relationships
-checkPedigreeNetwork <- checkPedigreeNetwork(df_repaired,
-  personID = "ID",
-  momID = "momID",
-  dadID = "dadID",
-  verbose = TRUE
-)
-
## No individuals with more than two parents detected.
-
## No duplicate edges detected.
-
## No cyclic relationships detected.
-
checkPedigreeNetwork
-
## $individuals_with_excess_parents
-## character(0)
-## 
-## $duplicate_edges
-##      [,1] [,2]
-## 
-## $is_acyclic
-## [1] TRUE
-

As we can see, the repaired pedigree now has unique IDs for all -individuals, and the parent-child relationships are intact. The function -checkIDs() confirms that all IDs are unique, while -checkPedigreeNetwork() verifies that there are no -structural issues like individuals with more than two parents or cyclic -relationships.

-
-
-

Visualize the Pedigree

-

We can now visualize the repaired pedigree using the -ggPedigree() function from {ggpedigree}. This function -generates a plot of the pedigree, with individuals colored based on -their affected status. In this case, we highlight Jon and Daenerys as -“affected” individuals. Otherwise they would be difficult to distinguish -from the rest of the pedigree. To make the plot more informative, we -also fill every member of the tree by how related they are to Rhaenyra -Targaryen, who is the focal individual in this case.

-

This function provides a more flexible and customizable way to -visualize pedigrees, allowing for easy integration with other -ggplot2 functions relative to kinship2’s pedigree plotting -functions.

-
library(ggpedigree)
-
-df_repaired_renamed <- df_repaired %>% rename(
-  personID = ID
-)
-plt <- ggpedigree(df_repaired_renamed,
-  overlay_column = "affected",
-  personID = "personID",
-  interactive = FALSE,
-  config = list(
-    overlay_include = TRUE,
-    point_size = .75,
-    code_male = "M",
-    ped_width = 17,
-    label_nudge_y = -.25,
-    include_labels = TRUE,
-    label_method = "geom_text",
-    # segment_self_color = "purple",
-    sex_color_include = FALSE,
-    focal_fill_personID = 353, # 339, # 353,
-    focal_fill_include = TRUE,
-    tooltip_columns = c("personID", "name", "focal_fill"),
-    focal_fill_force_zero = TRUE,
-    focal_fill_mid_color = "orange",
-    focal_fill_low_color = "#9F2A63FF",
-    focal_fill_legend_title = "Relatedness to \nAegon Targaryen",
-    focal_fill_na_value = "black",
-    value_rounding_digits = 4
-  )
-)
-
-plt
-

-
# reduce file size for CRAN
-# if (interactive()) {
-# If running interactively, use plotly::partial_bundle
-# to reduce file size for CRAN
-#  plotly::partial_bundle(plt)
-# } else {
-#  plotly::partial_bundle(plt, local = TRUE)
-# }
-

This plot can provide an interactive visualization of the ASOIAF -pedigree (if interactive is set to TRUE), allowing users to explore -relationships and affected status. The focal individual is highlighted, -and tooltips provide additional information about each character.

-
-
-

Conclusion

-

In this vignette, we demonstrated how to reconstruct and analyze the -A Song of Ice and Fire pedigree using the BGmisc -package. We computed additive and common nuclear relatedness -coefficients for key characters, revealing the complexities of their -relationships. By handling incomplete parentage data and visualizing the -pedigree, we provided a comprehensive overview of how related Jon Snow -and Daenerys Targaryen truly are.

-
- - - - - - - - - - - diff --git a/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-1.png b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-1.png new file mode 100644 index 00000000..2a2ff6f8 Binary files /dev/null and b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-2.png b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-2.png new file mode 100644 index 00000000..7e39189f Binary files /dev/null and b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-2.png differ diff --git a/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-3.png b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-3.png new file mode 100644 index 00000000..c2558546 Binary files /dev/null and b/vignettes/v5_ASOIAF_files/figure-html/unnamed-chunk-5-3.png differ diff --git a/vignettes/v6_pedigree_model_fitting.Rmd b/vignettes/v6_pedigree_model_fitting.Rmd new file mode 100644 index 00000000..9dbf820b --- /dev/null +++ b/vignettes/v6_pedigree_model_fitting.Rmd @@ -0,0 +1,560 @@ +--- +title: "Fitting Pedigree-Based Variance Component Models" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Fitting Pedigree-Based Variance Component Models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +# Introduction + +A core goal of behavior genetics is to decompose observed phenotypic variance into genetic and environmental components. Traditionally, this has been done using twin studies, which compare monozygotic (MZ) and dizygotic (DZ) twin pairs. However, extended pedigrees -- multi-generational families with known parentage -- provide richer information about relatedness and allow researchers to estimate a wider array of variance components. + +The `BGmisc` package provides a complete pipeline for pedigree-based variance component modeling: + +1. **Simulate** multi-generational pedigrees with `simulatePedigree()` +2. **Compute** relatedness matrices with `ped2add()`, `ped2cn()`, `ped2mit()`, and `ped2ce()` +3. **Check identification** with `identifyComponentModel()` and `comp2vech()` +4. **Build and fit** structural equation models with `buildOneFamilyGroup()`, `buildPedigreeMx()`, and `fitPedigreeModel()` + +This vignette walks through each step, from generating a pedigree to fitting a variance component model and interpreting the results. + +## Prerequisites + +This vignette requires the `OpenMx` package for structural equation modeling. If you don't have it installed, you can install it from CRAN or from the OpenMx website. + +```{r setup} +library(BGmisc) + +has_openmx <- requireNamespace("OpenMx", quietly = TRUE) +has_mvtnorm <- requireNamespace("mvtnorm", quietly = TRUE) + +if (has_openmx) { + library(OpenMx) +} else { + message( + "OpenMx is not installed. Code will be shown but not executed.\n", + "Install OpenMx with: install.packages('OpenMx')" + ) +} + +if (!has_mvtnorm) { + message( + "mvtnorm is not installed. Data simulation examples will not run.\n", + "Install mvtnorm with: install.packages('mvtnorm')" + ) +} else { + library(mvtnorm) +} + +run_models <- has_openmx && has_mvtnorm +``` + + +# Step 1: Simulate a Pedigree + +We begin by simulating a multi-generational pedigree using `simulatePedigree()`. This creates a balanced family structure with a specified number of generations and children per couple. + +```{r simulate-pedigree} +set.seed(421) + +ped <- simulatePedigree( + kpc = 3, # 3 children per couple + Ngen = 4, # 4 generations + sexR = 0.5, # equal sex ratio + marR = 0.6 # 60% mating rate +) + +head(ped) +``` + +The resulting data frame contains one row per individual with columns for family ID (`fam`), personal ID (`ID`), generation (`gen`), parent IDs (`dadID`, `momID`), spouse ID (`spID`), and biological sex (`sex`). + +- Number of individuals: ``r nrow(ped)`` +- Number of generations: ``r length(unique(ped$gen))`` + +```{r ped-summary} +summarizeFamilies(ped, famID = "fam")$family_summary +``` + + +# Step 2: Compute Relatedness Matrices + +With a pedigree in hand, we compute the various relatedness component matrices. Each matrix is square and symmetric, with rows and columns corresponding to individuals in the pedigree. The entry in row *i* and column *j* quantifies the relatedness between person *i* and person *j* along a specific pathway. + +## Additive Genetic Relatedness + +The additive genetic relatedness matrix captures the expected proportion of nuclear DNA shared identical by descent (IBD) between two individuals. For example, parent-offspring pairs share 0.5, full siblings share 0.5 on average, half-siblings share 0.25, and so on. + +```{r compute-additive} +add_matrix <- ped2add(ped, sparse = FALSE) +add_matrix[1:5, 1:5] +``` + +## Common Nuclear Environment + +The common nuclear environment matrix captures whether two individuals shared both biological parents (i.e., were raised in the same nuclear family). Full siblings who share the same mother and father have a value of 1; all others have 0. + +```{r compute-nuclear} +cn_matrix <- ped2cn(ped, sparse = FALSE) +cn_matrix[1:5, 1:5] +``` + +## Mitochondrial Relatedness + +The mitochondrial relatedness matrix captures shared maternal lineage. Individuals who share the same maternal line (mother, maternal grandmother, etc.) share mitochondrial DNA. + +```{r compute-mito} +mt_matrix <- ped2mit(ped, sparse = FALSE) +mt_matrix[1:5, 1:5] +``` + +## Common Extended Family Environment + +The common extended family environment matrix indicates whether individuals belong to the same extended family. For a single pedigree, this is simply a matrix of ones. + +```{r compute-extended} +ce_matrix <- ped2ce(ped) +ce_matrix[1:5, 1:5] +``` + + +# Step 3: Check Model Identification + +Before fitting a model, we need to verify that the variance components are *identified* -- that is, the data provide enough information to uniquely estimate each parameter. If components are not identified, their estimates can trade off against each other, leading to unstable or meaningless results. + +The `identifyComponentModel()` function checks identification by vectorizing each relatedness component matrix (via `comp2vech()`) and testing whether the resulting design matrix has full column rank. Each component matrix becomes a column in this design matrix. If the rank equals the number of components, the model is identified. + +For more background on identification in variance component models, see `vignette("v1_modelingvariancecomponents", package = "BGmisc")`. + +## Checking Our Full Model + +We plan to fit a 5-component model with additive genetic (A), common nuclear environment (Cn), common extended environment (Ce), mitochondrial (Mt), and unique environment (E) components. Let's check whether these five components are simultaneously identified using the relatedness matrices we just computed: + +```{r identify-full-model} +id_full <- identifyComponentModel( + A = add_matrix, + Cn = cn_matrix, + Ce = ce_matrix, + Mt = mt_matrix, + E = diag(1, nrow(add_matrix)) +) +id_full +``` +```{r, include=FALSE} +identified <- id_full$identified + +identified_text <- "The full model is identified. We can proceed to fit it. However, to illustrate the process of checking identification and refining the model, we will also show how to interpret the details of the identification check. Below, I have provided an unidentified model to demonstrate how to use the `nidp` element of the result to understand which components are confounded." + +not_identified_text <- "The full model is NOT identified. We will need to refine the model by dropping or fixing some components." + +``` + +`r if (identified) paste(identified_text) else not_identified_text` + +```{r identify-full-model-details, include = identified} +# show if model is identified + +identifyComponentModel( + A = add_matrix, + A2 = add_matrix, + Cn = cn_matrix, + Ce = ce_matrix, + Mt = mt_matrix, + E = diag(1, nrow(add_matrix)) +) + +``` + +With a single pedigree, the 5-component model *may* not be fully identified. The `nidp` element in the result tells us which components are confounded. This is because some relatedness matrices can be linearly dependent -- for example, `ce_matrix` is a matrix of all ones for a single family, and the identity matrix (E) plus other components may span a similar space. In this case, our model is identified, but if it were not, we would see a message like "Variance components are not all identified. (And even if they were, we might not have enough data to estimate them all.) + +## Narrowing Down to an Identified Model + +Based on the identification check above, we can drop or fix the non-identified components. A natural choice is to remove the components flagged by `identifyComponentModel()` and re-check: + +```{r identify-reduced} +# A simpler model: A + Cn + E +id_ace <- identifyComponentModel( + A = list(add_matrix), + Cn = list(cn_matrix), + E = diag(1, nrow(add_matrix)) +) +id_ace +``` + +A single extended pedigree typically provides enough variation in relatedness coefficients (parent-offspring = 0.5, siblings = 0.5, grandparent-grandchild = 0.25, cousins = 0.125, etc.) to identify the A + Cn + E model. + +## Recovering Identification with Multiple Families + +When a component is not identified with one family, adding families with different structures can help. Each family contributes additional rows to the design matrix. Let's check whether the full 5-component model becomes identified when we combine two pedigrees: + +```{r identify-multi} +set.seed(99) +ped2 <- simulatePedigree(kpc = 4, Ngen = 3, marR = 0.5) |> makeTwins() +add2 <- ped2add(ped2, sparse = FALSE) +cn2 <- ped2cn(ped2, sparse = FALSE) +ce2 <- ped2ce(ped2) +mt2 <- ped2mit(ped2, sparse = FALSE) + +# Check the full model across two families +n_combined <- nrow(add_matrix) + nrow(add2) +id_two_fam <- identifyComponentModel( + A = list(add_matrix, add2), + Cn = list(cn_matrix, cn2), + Ce = list(ce_matrix, ce2), + Mt = list(mt_matrix, mt2), + E = diag(1, n_combined) +) +id_two_fam +``` + +This result guides our modeling decisions in the steps that follow. When fitting the model below, we set the non-identified components' true values to zero so that we have a known ground truth to recover. + + +# Step 4: Simulate Phenotypic Data + +Before fitting a model, we need observed data. In practice, this would be measured phenotypes (e.g., height, cognitive ability, disease status). Here, we simulate phenotypic data from a known variance component structure so we can verify that our model recovers the true parameters. + +We define "true" variance components and use the relatedness matrices to construct the population covariance matrix, then sample from it. + +```{r simulate-phenotype, eval = has_mvtnorm} +# True variance components (proportions of total variance) +true_var <- list( + ad2 = 0.50, # additive genetic + cn2 = 0.10, # common nuclear environment + ce2 = 0.00, # common extended environment (set to 0 for identifiability) + mt2 = 0.00, # mitochondrial (set to 0 for simplicity) + ee2 = 0.40 # unique environment (residual) +) + +# Build the implied covariance matrix +# V = ad2*A + cn2*Cn + ce2*U + mt2*Mt + ee2*I +n <- nrow(add_matrix) +I_mat <- diag(1, n) +U_mat <- matrix(1, n, n) + +V_true <- true_var$ad2 * add_matrix + + true_var$cn2 * cn_matrix + + true_var$ce2 * U_mat + + true_var$mt2 * mt_matrix + + true_var$ee2 * I_mat + +# Simulate one realization of the phenotype vector +set.seed(123) +y <- mvtnorm::rmvnorm(1, sigma = V_true) + + +# Create named variable labels (required by OpenMx) +ytemp <- paste("S", rownames(add_matrix)) +``` + +```{r show-phenotype} + +if (!exists("y")) { + y <- rep(NA, nrow(add_matrix)) +} + +``` + + +We simulated phenotypic data for`r ncol(y)` individuals, with a mean of `r round(mean(y), 3)` and a standard deviation of `r round(sd(y), 3)`. The variance in this simulated phenotype arises from the specified genetic and environmental components according to the covariance structure we defined. + +In practice, you would have data from multiple independently ascertained families. Here we simulate data from a single pedigree for simplicity, but the model-fitting functions support multiple pedigrees (shown in a later section). + + +# Step 5: Build the OpenMx Model + +The `BGmisc` package provides helper functions that construct the OpenMx model in three layers: + +1. **`buildPedigreeModelCovariance()`** -- Creates the variance component parameters (free parameters to be estimated) +2. **`buildOneFamilyGroup()`** -- Creates the model for a single family, embedding the relatedness matrices and observed data +3. **`buildPedigreeMx()`** -- Combines the variance components with one or more family groups into a multi-group model + +## Building a Single-Family Model + +Let's walk through building the model step by step. + +### Variance Component Parameters + +The `buildPedigreeModelCovariance()` function creates the OpenMx sub-model that holds the free variance component parameters. You can control which components to include: + +```{r build-covariance, eval = run_models} +# Starting values for the optimizer +start_vars <- list( + ad2 = 0.3, dd2 = 0, cn2 = 0.1, + ce2 = 0.1, mt2 = 0.1, am2 = 0, + ee2 = 0.4 +) + +# Build variance component sub-model +vc_model <- buildPedigreeModelCovariance( + vars = start_vars, + Vad = TRUE, # estimate additive genetic variance + Vdd = FALSE, # do not estimate dominance + Vcn = TRUE, # estimate common nuclear environment + Vce = TRUE, # estimate common extended environment + Vmt = TRUE, # estimate mitochondrial + Vam = FALSE, # do not estimate A x Mt interaction + Ver = TRUE # estimate unique environment +) +vc_model + +summary(vc_model) +``` + +### Family Group Model + +The `buildOneFamilyGroup()` function constructs the model for one family. It takes the relatedness matrices and the observed data for that family: + +```{r build-group, eval = run_models} +# Format the observed data as a 1-row matrix with named columns +obs_data <- matrix(y, nrow = 1, dimnames = list(NULL, ytemp)) + +# Build the family group model +family_group <- buildOneFamilyGroup( + group_name = "family1", + Addmat = add_matrix, + Nucmat = cn_matrix, + Extmat = ce_matrix, + Mtdmat = mt_matrix, + full_df_row = obs_data, + ytemp = ytemp +) +``` + +The family group model contains: + +- **Identity matrix (I)** and **unit matrix (U)** for the unique and extended environment components +- **Relatedness matrices** (A, Cn, Mt, etc.) as fixed data matrices +- **An mxAlgebra** that computes the implied covariance: $V = \sigma^2_a A + \sigma^2_{cn} C_n + \sigma^2_{ce} U + \sigma^2_{mt} Mt + \sigma^2_e I$ +- **Normal expectation** with the covariance and a free mean + + +### Assembling the Full Model + +The `buildPedigreeMx()` function combines the variance component parameters (shared across all families) with the family group model(s): + +```{r build-full, eval = run_models} +full_model <- buildPedigreeMx( + model_name = "PedigreeVCModel", + vars = start_vars, + group_models = list(family_group) +) +full_model$submodels +``` + + +# Step 6: Fit the Model + +With the model assembled, we fit it using OpenMx's optimizer. The `mxRun()` function performs maximum likelihood estimation: + +```{r fit-model, eval = run_models} +fitted_model <- mxRun(full_model) +smr <- summary(fitted_model) +``` + +```{r show-results, eval = run_models} +# Extract variance component estimates +estimates <- c( + Vad = fitted_model$ModelOne$Vad$values[1, 1], + Vcn = fitted_model$ModelOne$Vcn$values[1, 1], + Vce = fitted_model$ModelOne$Vce$values[1, 1], + Vmt = fitted_model$ModelOne$Vmt$values[1, 1], + Ver = fitted_model$ModelOne$Ver$values[1, 1] +) + +# Compare estimates to true values +truth <- c( + Vad = true_var$ad2, + Vcn = true_var$cn2, + Vce = true_var$ce2, + Vmt = true_var$mt2, + Ver = true_var$ee2 +) + +comparison <- data.frame( + Component = names(truth), + True = truth, + Estimated = round(estimates, 4) +) +comparison +``` + +```{r show-fit-stats, eval = run_models} +cat("-2 Log Likelihood:", smr$Minus2LogLikelihood, "\n") +cat("Converged:", fitted_model$output$status$code == 0, "\n") +``` + +With a single pedigree realization, estimates will vary from the true values due to sampling variability. Estimation improves substantially with multiple families, as shown next. + + +# Step 7: Multi-Pedigree Model + +In practice, researchers have data from multiple families. The BGmisc helpers are designed for this multi-group scenario, where variance component parameters are shared across families but each family has its own relatedness structure and data. + +## Simulating Multiple Families + +```{r multi-ped-simulate, eval = run_models} +set.seed(2024) +n_families <- 5 + +# Storage +ped_list <- vector("list", n_families) +add_list <- vector("list", n_families) +cn_list <- vector("list", n_families) +mt_list <- vector("list", n_families) +ce_list <- vector("list", n_families) +y_list <- vector("list", n_families) +ytemp_list <- vector("list", n_families) + +for (i in seq_len(n_families)) { + # Simulate each family with slightly different structure + ped_i <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6) + ped_list[[i]] <- ped_i + + # Compute relatedness matrices + A_i <- as.matrix(ped2add(ped_i)) + Cn_i <- as.matrix(ped2cn(ped_i)) + Mt_i <- as.matrix(ped2mit(ped_i)) + Ce_i <- ped2ce(ped_i) + n_i <- nrow(A_i) + + add_list[[i]] <- A_i + cn_list[[i]] <- Cn_i + mt_list[[i]] <- Mt_i + ce_list[[i]] <- Ce_i + + # Build implied covariance and simulate data + I_i <- diag(1, n_i) + U_i <- matrix(1, n_i, n_i) + V_i <- true_var$ad2 * A_i + + true_var$cn2 * Cn_i + + true_var$ce2 * U_i + + true_var$mt2 * Mt_i + + true_var$ee2 * I_i + + y_list[[i]] <- mvtnorm::rmvnorm(1, sigma = V_i) + ytemp_list[[i]] <- paste("S", rownames(A_i)) +} + +cat("Simulated", n_families, "families\n") +cat("Family sizes:", vapply(ped_list, nrow, integer(1)), "\n") +``` + +## Building and Fitting the Multi-Group Model + +We build a group model for each family and then combine them: + +```{r multi-ped-fit, eval = run_models} +# Build group models for each family +group_models <- lapply(seq_len(n_families), function(i) { + obs_data_i <- matrix(y_list[[i]], nrow = 1, dimnames = list(NULL, ytemp_list[[i]])) + + buildOneFamilyGroup( + group_name = paste0("ped", i), + Addmat = add_list[[i]], + Nucmat = cn_list[[i]], + Extmat = ce_list[[i]], + Mtdmat = mt_list[[i]], + full_df_row = obs_data_i, + ytemp = ytemp_list[[i]] + ) +}) + +# Build the multi-group model +multi_model <- buildPedigreeMx( + model_name = "MultiPedigreeModel", + vars = start_vars, + group_models = group_models +) + +# Fit the model +fitted_multi <- mxRun(multi_model) +smr_multi <- summary(fitted_multi) +``` + +```{r multi-ped-results, eval = run_models} +# Extract and compare estimates +estimates_multi <- c( + Vad = fitted_multi$ModelOne$Vad$values[1, 1], + Vcn = fitted_multi$ModelOne$Vcn$values[1, 1], + Vce = fitted_multi$ModelOne$Vce$values[1, 1], + Vmt = fitted_multi$ModelOne$Vmt$values[1, 1], + Ver = fitted_multi$ModelOne$Ver$values[1, 1] +) + +comparison_multi <- data.frame( + Component = c("Additive genetic (Vad)", "Common nuclear (Vcn)", + "Common extended (Vce)", "Mitochondrial (Vmt)", + "Unique environment (Ver)"), + True = truth, + Estimated = round(estimates_multi, 4) +) +comparison_multi + +cat("\n-2 Log Likelihood:", smr_multi$Minus2LogLikelihood, "\n") +cat("Converged:", fitted_multi$output$status$code == 0, "\n") +``` + +With multiple families providing more information, the estimates should more closely approximate the true generating values. + + +# Using the High-Level `fitPedigreeModel()` Wrapper + +For convenience, `fitPedigreeModel()` wraps the build and fit steps together. It accepts pre-built group models and uses `mxTryHard()` for robust optimization with multiple starts: + +```{r fit-highlevel, eval = run_models} +fitted_easy <- fitPedigreeModel( + model_name = "EasyFit", + vars = start_vars, + data = NULL, + group_models = group_models, + tryhard = TRUE +) + +summary(fitted_easy) +``` + + +# Understanding the Covariance Structure + +The key equation underlying the model is: + +$$V = \sigma^2_a \mathbf{A} + \sigma^2_{cn} \mathbf{C}_n + \sigma^2_{ce} \mathbf{U} + \sigma^2_{mt} \mathbf{M} + \sigma^2_e \mathbf{I}$$ + +where: + +- $\mathbf{A}$ is the additive genetic relatedness matrix (from `ped2add()`) +- $\mathbf{C}_n$ is the common nuclear environment matrix (from `ped2cn()`) +- $\mathbf{U}$ is a matrix of ones representing shared extended family environment (from `ped2ce()`) +- $\mathbf{M}$ is the mitochondrial relatedness matrix (from `ped2mit()`) +- $\mathbf{I}$ is the identity matrix (unique environment, one per person) +- $\sigma^2_a, \sigma^2_{cn}, \sigma^2_{ce}, \sigma^2_{mt}, \sigma^2_e$ are the variance components to be estimated + +This is an extension of the classical twin model to arbitrary pedigree structures. The additive genetic relatedness matrix generalizes the concept of MZ twins sharing 100% of genes and DZ twins sharing 50% -- in a pedigree, every pair of relatives has a specific coefficient of relatedness determined by their genealogical connection. + + +# Summary + +This vignette demonstrated the full workflow for pedigree-based variance component modeling: + +| Step | Function | Purpose | +|------|----------|---------| +| 1 | `simulatePedigree()` | Generate a multi-generational pedigree | +| 2 | `ped2add()`, `ped2cn()`, `ped2mit()`, `ped2ce()` | Compute relatedness matrices | +| 3 | `identifyComponentModel()`| Check model identification | +| 4 | Simulate or prepare phenotypic data | Observed data for model fitting | +| 5 | `buildOneFamilyGroup()`, `buildPedigreeModelCovariance()` | Build OpenMx model components | +| 6 | `buildPedigreeMx()`, `mxRun()` or `fitPedigreeModel()` | Assemble and fit the model | +| 7 | Multiple families | Scale to multi-group pedigree models | + +The helper functions (`buildPedigreeModelCovariance()`, `buildOneFamilyGroup()`, `buildFamilyGroups()`, `buildPedigreeMx()`, `fitPedigreeModel()`) handle the mechanics of translating pedigree relatedness matrices into proper OpenMx model specifications, allowing researchers to focus on the substantive questions rather than the modeling boilerplate.