From 3bcdb0918043dba16b7ba8011dd80f8f798c6002 Mon Sep 17 00:00:00 2001
From: davharris <harry491@gmail.com>
Date: Mon, 31 Mar 2014 17:44:56 -0700
Subject: [PATCH 1/2] add spelling.R

---
 DESCRIPTION  |  3 ++-
 R/spelling.R | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 R/spelling.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 0e38480..75abf1b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,7 +17,8 @@ Imports:
     whisker,
     lubridate,
     stringr,
-    outliers
+    outliers,
+    stringdist
 Suggests:
     testthat,
     xtable
diff --git a/R/spelling.R b/R/spelling.R
new file mode 100644
index 0000000..c5a804c
--- /dev/null
+++ b/R/spelling.R
@@ -0,0 +1,39 @@
+detect_cap = function(char){
+  uniques = unique(char)
+  lower_uniques = tolower(uniques)
+  # Lower-cased spellings shared by several distinct values; table() drops NA.
+  matches = names(which(table(lower_uniques) > 1))
+  
+  # Gives one character vector per shared spelling, else an invisible NULL.
+  if(length(matches) > 0){
+    message("The following sets of elements are identical except for capitalization")
+    lapply(
+      matches,
+      # %in% never yields NA, so a missing value is not added to every set
+      # (indexing with `x == lower_uniques` appended an NA element).
+      function(x) uniques[lower_uniques %in% x]
+    )
+  }
+}
+
+
+# I defaulted to Jaccard similarity because it's pre-scaled.  It's probably not
+# the best distance metric for this though!
+detect_similar = function(char, method = "jaccard", tol = 0.1, ...){
+  uniques = unique(char)
+  distmat = stringdistmatrix(uniques, uniques, method = method, ...)
+  
+  indices = which(distmat < tol & upper.tri(distmat))
+  
+  message("The following pairs of elements were uncomfortably similar")
+  
+  # NOTE(review): the message above is emitted even when no pair is flagged.
+  # Returns one row per flagged pair; upper.tri() keeps each unordered pair once
+  # and skips self-comparisons. Column names a/b carry no meaning. Possible
+  # extension: group similar values beyond pairs and show the commoner one.
+  data.frame(
+    a = uniques[row(distmat)[indices]], 
+    b = uniques[col(distmat)[indices]]
+  )
+  
+}

From fff29e5d7b854fb13195a0bfd9fdfcc29e9071b4 Mon Sep 17 00:00:00 2001
From: davharris <harry491@gmail.com>
Date: Mon, 31 Mar 2014 17:49:47 -0700
Subject: [PATCH 2/2] documentation

---
 R/spelling.R | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/R/spelling.R b/R/spelling.R
index c5a804c..b402756 100644
--- a/R/spelling.R
+++ b/R/spelling.R
@@ -1,3 +1,6 @@
+#' Detect possible capitalization errors
+#' @param char a character vector
+#' @export
 detect_cap = function(char){
   uniques = unique(char)
   lower_uniques = tolower(uniques)
@@ -16,14 +19,20 @@ detect_cap = function(char){
   }
 }
 
-
-# I defaulted to Jaccard similarity because it's pre-scaled.  It's probably not
-# the best distance metric for this though!
+#' Detect uncomfortably similar string pairs
+#' @param char a character vector
+#' @param method a valid \code{method} for \code{stringdistmatrix}
+#' @param tol distance at or below which a pair of values is flagged as similar
+#' @param ... additional arguments to \code{stringdistmatrix}
+#' @details The Jaccard method is the default because it's pre-scaled, but it
+#' is probably not the best distance metric for this.
+#' @importFrom stringdist stringdistmatrix
+#' @export
 detect_similar = function(char, method = "jaccard", tol = 0.1, ...){
   uniques = unique(char)
   distmat = stringdistmatrix(uniques, uniques, method = method, ...)
   
-  indices = which(distmat < tol & upper.tri(distmat))
+  indices = which(distmat <= tol & upper.tri(distmat))
   
   message("The following pairs of elements were uncomfortably similar")