From 3bcdb0918043dba16b7ba8011dd80f8f798c6002 Mon Sep 17 00:00:00 2001
From: davharris
Date: Mon, 31 Mar 2014 17:44:56 -0700
Subject: [PATCH 1/2] add spelling.R

---
 DESCRIPTION  |  3 ++-
 R/spelling.R | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 R/spelling.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 0e38480..75abf1b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,7 +17,8 @@ Imports:
     whisker,
     lubridate,
     stringr,
-    outliers
+    outliers,
+    stringdist
 Suggests:
     testthat,
     xtable
diff --git a/R/spelling.R b/R/spelling.R
new file mode 100644
index 0000000..c5a804c
--- /dev/null
+++ b/R/spelling.R
@@ -0,0 +1,39 @@
+detect_cap = function(char){
+  uniques = unique(char)
+  lower_uniques = tolower(uniques)
+
+  matches = names(which(table(lower_uniques) > 1))
+
+
+  if(length(matches) > 0){
+    message("The following sets of elements are identical except for capitalization")
+    lapply(
+      matches,
+      function(x){
+        uniques[x == lower_uniques]
+      }
+    )
+  }
+}
+
+
+# I defaulted to Jaccard similarity because it's pre-scaled. It's probably not
+# the best distance metric for this though!
+detect_similar = function(char, method = "jaccard", tol = 0.1, ...){
+  uniques = unique(char)
+  distmat = stringdistmatrix(uniques, uniques, method = method, ...)
+
+  indices = which(distmat < tol & upper.tri(distmat))
+
+  message("The following pairs of elements were uncomfortably similar")
+
+  # Could do something fancy like grouping them together (not just pairs) &
+  # indicating which name is more common.
+
+  #Column names aren't meaningful
+  data.frame(
+    a = uniques[row(distmat)[indices]],
+    b = uniques[col(distmat)[indices]]
+  )
+
+}

From fff29e5d7b854fb13195a0bfd9fdfcc29e9071b4 Mon Sep 17 00:00:00 2001
From: davharris
Date: Mon, 31 Mar 2014 17:49:47 -0700
Subject: [PATCH 2/2] documentation

---
 R/spelling.R | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/R/spelling.R b/R/spelling.R
index c5a804c..b402756 100644
--- a/R/spelling.R
+++ b/R/spelling.R
@@ -1,3 +1,7 @@
+#' Detect possible capitalization errors
+#' @param char a character vector
+#' @return a list of character vectors (one per set), or \code{NULL} if none
+#' @export
 detect_cap = function(char){
   uniques = unique(char)
   lower_uniques = tolower(uniques)
@@ -16,14 +20,21 @@ detect_cap = function(char){
   }
 }
 
-
-# I defaulted to Jaccard similarity because it's pre-scaled. It's probably not
-# the best distance metric for this though!
+#' Detect uncomfortably similar string pairs
+#' @param char a character vector
+#' @param method a valid \code{method} for \code{stringdistmatrix}
+#' @param tol pairs whose distance is at or below this value are reported
+#' @param ... additional arguments to \code{stringdistmatrix}
+#' @details I defaulted to Jaccard distance because it's pre-scaled. It's
+#' probably not the best distance metric for this though!
+#' @return a data.frame with columns \code{a} and \code{b}, one row per pair
+#' @importFrom stringdist stringdistmatrix
+#' @export
 detect_similar = function(char, method = "jaccard", tol = 0.1, ...){
   uniques = unique(char)
   distmat = stringdistmatrix(uniques, uniques, method = method, ...)
 
-  indices = which(distmat < tol & upper.tri(distmat))
+  indices = which(distmat <= tol & upper.tri(distmat))
 
   message("The following pairs of elements were uncomfortably similar")
 