all_words gains char.keep and char2space arguments to enable r…

…etention of characters and multi word phrases. These features are passed to `freq_terms` as well. Suggestd by stackoverflow's lawyeR (http://stackoverflow.com/a/26162401/1000343).
trinker · Oct 2, 2014 · 4e0c8f8 · 4e0c8f8
1 parent a65b028
commit 4e0c8f8
Show file tree

Hide file tree

Showing 6 changed files with 75 additions and 32 deletions.
diff --git a/NEWS b/NEWS
@@ -44,6 +44,11 @@ MINOR FEATURES
 
 IMPROVEMENTS
 
+  `all_words` gains `char.keep` and `char2space` arguments to enable retention 
+  of characters and multi word phrases.  These features are passed to 
+  `freq_terms` as well.  Suggestd by stackoverflow's lawyeR 
+  (http://stackoverflow.com/a/26162401/1000343).
+
 CHANGES
 
 * `rm_url` has been moved into its own canned regex pattern extraction/replacer

diff --git a/NEWS.md b/NEWS.md
@@ -44,6 +44,11 @@ And constructed with the following guidelines:
 
 **IMPROVEMENTS**
 
+  `all_words` gains `char.keep` and `char2space` arguments to enable retention 
+  of characters and multi word phrases.  These features are passed to 
+  `freq_terms` as well.  Suggestd by stackoverflow's lawyeR 
+  (<a href="http://stackoverflow.com/a/26162401/1000343)." target="_blank">http://stackoverflow.com/a/26162401/1000343).</a>
+
 **CHANGES**
 
 * `rm_url` has been moved into its own canned regex pattern extraction/replacer

diff --git a/R/all_words.R b/R/all_words.R
@@ -13,6 +13,11 @@
 #' \code{FALSE} orders the rows by descending frequency.
 #' @param apostrophe.remove logical.  If \code{TRUE} removes apostrophes from 
 #' the text before examining.
+#' @param char.keep A character vector of symbol character (i.e., punctuation) 
+#' that strip should keep.  The default is to strip everything except 
+#' apostrophes.  This enables the use of special characters to be turned into 
+#' spaces or for characters to be retained.
+#' @param char2space A vector of characters to be turned into spaces. 
 #' @param \ldots Other argument supplied to \code{\link[qdap]{strip}}.
 #' @return Returns a dataframe with frequency counts of words that begin with or 
 #' contain the provided word chunk.
@@ -39,35 +44,47 @@
 #' 
 #' ## Filter by nchar and stopwords
 #' Filter(head(x3), min = 3)
+#' 
+#' ## Keep spaces
+#' all_words(space_fill(DATA$state, c("are you", "can be")))
 #' }
 all_words <- 
-function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE, apostrophe.remove = FALSE, ...){
-    if (!is.null(begins.with) & !is.null(contains)) {
-        stop("Can not use both 'begins.with' & 'contains' arguments")
-    }
-    if(!is.null(begins.with)) begins.with <- tolower(begins.with)
-    if(!is.null(contains)) contains <- tolower(contains)
-    WORDS <- unlist(bag_o_words(strip(text.var, apostrophe.remove = apostrophe.remove, ...)))
-    names(WORDS) <- NULL
-    y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names=NULL)
-    names(y) <- c("WORD", "FREQ")
-    y$WORD <- as.character(y$WORD)
-    y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"]))
-    if (!is.null(begins.with)) {
-        y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ]
-        if(nrow(y)==0) stop("No words match")
-    }
-    if (!is.null(contains)) {
-        y <- y[grep(contains, y[, 1]), ]
-        if(nrow(y)==0) stop("No words match")
-    }
-    if (!alphabetical) {
-        y <- y[order(-y$FREQ, y$WORD), ]
+    function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE, 
+             apostrophe.remove = FALSE,  char.keep = char2space, char2space = "~~", ...){
+
+        if (!is.null(begins.with) & !is.null(contains)) {
+            stop("Can not use both 'begins.with' & 'contains' arguments")
+        }
+        if(!is.null(begins.with)) begins.with <- tolower(begins.with)
+        if(!is.null(contains)) contains <- tolower(contains)
+
+        char.keep <- unique(c(char2space, char.keep))
+
+        WORDS <- unlist(bag_o_words(text.var, apostrophe.remove = apostrophe.remove, 
+                                    char.keep, char.keep, ...), use.names=FALSE)
+
+        y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names=NULL)
+        names(y) <- c("WORD", "FREQ")
+        y$WORD <- as.character(y$WORD)
+        y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"]))
+        if (!is.null(begins.with)) {
+            y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ]
+            if(nrow(y)==0) stop("No words match")
+        }
+        if (!is.null(contains)) {
+            y <- y[grep(contains, y[, 1]), ]
+            if(nrow(y)==0) stop("No words match")
+        }
+        if (!alphabetical) {
+            y <- y[order(-y$FREQ, y$WORD), ]
+        }
+        if (!is.null(char2space)) {
+            y[["WORD"]] <- mgsub(char2space, " ", y[["WORD"]])
+        }
+        p <- class(y)
+        class(y) <- c("all_words", p)
+        y
     }
-    p <- class(y)
-    class(y) <- c("all_words", p)
-    y
-}
 
 #' Prints an all_words Object
 #' 
@@ -93,12 +110,11 @@ print.all_words <- function(x, ...) {
 #' @method Filter all_words
 #' @return \code{Filter.all_words} - Returns a matrix of the class "all_words".
 Filter.all_words <- function(x, min = 1, max = Inf, 
-    count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) {
-
+                             count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) {
+    
     word_list_filter_helper(x, min = min, max = max, 
-        count.apostrophe = count.apostrophe, stopwords = stopwords, 
-        ignore.case = ignore.case, ...)
+                            count.apostrophe = count.apostrophe, stopwords = stopwords, 
+                            ignore.case = ignore.case, ...)
 }
 
 
-
diff --git a/R/freq_terms.R b/R/freq_terms.R
@@ -47,6 +47,9 @@
 #'     plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i])
 #' })
 #' dev.off()
+#' 
+#' ## Keep spaces
+#' freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~")
 #' }
 freq_terms <- 
 function(text.var, top = 20, at.least = 1, stopwords = NULL, 

diff --git a/man/all_words.Rd b/man/all_words.Rd
@@ -4,7 +4,8 @@
 \title{Searches Text Column for Words}
 \usage{
 all_words(text.var, begins.with = NULL, contains = NULL,
-  alphabetical = TRUE, apostrophe.remove = FALSE, ...)
+  alphabetical = TRUE, apostrophe.remove = FALSE, char.keep = char2space,
+  char2space = "~~", ...)
 }
 \arguments{
 \item{text.var}{The text variable.}
@@ -21,6 +22,13 @@ Use this if searching for a word containing the word chunk.}
 \item{apostrophe.remove}{logical.  If \code{TRUE} removes apostrophes from
 the text before examining.}
 
+\item{char.keep}{A character vector of symbol character (i.e., punctuation)
+that strip should keep.  The default is to strip everything except
+apostrophes.  This enables the use of special characters to be turned into
+spaces or for characters to be retained.}
+
+\item{char2space}{A vector of characters to be turned into spaces.}
+
 \item{\ldots}{Other argument supplied to \code{\link[qdap]{strip}}.}
 }
 \value{
@@ -54,6 +62,9 @@ head(x5)
 
 ## Filter by nchar and stopwords
 Filter(head(x3), min = 3)
+
+## Keep spaces
+all_words(space_fill(DATA$state, c("are you", "can be")))
 }
 }
 \seealso{

diff --git a/man/freq_terms.Rd b/man/freq_terms.Rd
@@ -59,6 +59,9 @@ lapply(seq_along(out), function(i) {
     plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i])
 })
 dev.off()
+
+## Keep spaces
+freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~")
 }
 }
 \seealso{