Skip to content

Commit

Permalink
all_words gains char.keep and char2space arguments to enable r…
Browse files Browse the repository at this point in the history
…etention

  of characters and multi word phrases.  These features are passed to
  `freq_terms` as well.  Suggested by stackoverflow's lawyeR
  (http://stackoverflow.com/a/26162401/1000343).
  • Loading branch information
trinker committed Oct 2, 2014
1 parent a65b028 commit 4e0c8f8
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 32 deletions.
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ MINOR FEATURES

IMPROVEMENTS

`all_words` gains `char.keep` and `char2space` arguments to enable retention
of characters and multi word phrases. These features are passed to
`freq_terms` as well. Suggested by stackoverflow's lawyeR
(http://stackoverflow.com/a/26162401/1000343).

CHANGES

* `rm_url` has been moved into its own canned regex pattern extraction/replacer
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ And constructed with the following guidelines:

**IMPROVEMENTS**

`all_words` gains `char.keep` and `char2space` arguments to enable retention
of characters and multi word phrases. These features are passed to
`freq_terms` as well. Suggested by stackoverflow's lawyeR
(<a href="http://stackoverflow.com/a/26162401/1000343" target="_blank">http://stackoverflow.com/a/26162401/1000343</a>).

**CHANGES**

* `rm_url` has been moved into its own canned regex pattern extraction/replacer
Expand Down
78 changes: 47 additions & 31 deletions R/all_words.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
#' \code{FALSE} orders the rows by descending frequency.
#' @param apostrophe.remove logical. If \code{TRUE} removes apostrophes from
#' the text before examining.
#' @param char.keep A character vector of symbol characters (i.e., punctuation)
#' that strip should keep. The default is to strip everything except
#' apostrophes. This enables the use of special characters to be turned into
#' spaces or for characters to be retained.
#' @param char2space A vector of characters to be turned into spaces.
#' @param \ldots Other arguments supplied to \code{\link[qdap]{strip}}.
#' @return Returns a dataframe with frequency counts of words that begin with or
#' contain the provided word chunk.
Expand All @@ -39,35 +44,47 @@
#'
#' ## Filter by nchar and stopwords
#' Filter(head(x3), min = 3)
#'
#' ## Keep spaces
#' all_words(space_fill(DATA$state, c("are you", "can be")))
#' }
all_words <-
function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE, apostrophe.remove = FALSE, ...){
if (!is.null(begins.with) & !is.null(contains)) {
stop("Can not use both 'begins.with' & 'contains' arguments")
}
if(!is.null(begins.with)) begins.with <- tolower(begins.with)
if(!is.null(contains)) contains <- tolower(contains)
WORDS <- unlist(bag_o_words(strip(text.var, apostrophe.remove = apostrophe.remove, ...)))
names(WORDS) <- NULL
y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names=NULL)
names(y) <- c("WORD", "FREQ")
y$WORD <- as.character(y$WORD)
y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"]))
if (!is.null(begins.with)) {
y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ]
if(nrow(y)==0) stop("No words match")
}
if (!is.null(contains)) {
y <- y[grep(contains, y[, 1]), ]
if(nrow(y)==0) stop("No words match")
}
if (!alphabetical) {
y <- y[order(-y$FREQ, y$WORD), ]
function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE,
    apostrophe.remove = FALSE, char.keep = char2space, char2space = "~~", ...){

    # `begins.with` and `contains` are mutually exclusive search modes.  Both
    # tests are length-one, so use the scalar, short-circuiting `&&`.
    if (!is.null(begins.with) && !is.null(contains)) {
        stop("Can not use both 'begins.with' & 'contains' arguments")
    }

    # Lower-case the search chunks before they are compared with the words.
    if (!is.null(begins.with)) begins.with <- tolower(begins.with)
    if (!is.null(contains)) contains <- tolower(contains)

    # Any character that is later turned into a space must also be kept while
    # the text is split into words, so fold `char2space` into `char.keep`.
    char.keep <- unique(c(char2space, char.keep))

    # Pass `char.keep` by name.  Supplying it positionally (and twice) lets R
    # bind the copies to whichever downstream formals come next instead of to
    # `char.keep`.
    WORDS <- unlist(bag_o_words(text.var, apostrophe.remove = apostrophe.remove,
        char.keep = char.keep, ...), use.names = FALSE)

    # Tabulate the words into a two column data frame: WORD (character) and
    # FREQ (numeric).
    y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names = NULL)
    names(y) <- c("WORD", "FREQ")
    y$WORD <- as.character(y$WORD)
    y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"]))

    # Keep only the words whose leading characters equal `begins.with`.
    if (!is.null(begins.with)) {
        y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ]
        if (nrow(y) == 0) stop("No words match")
    }

    # Keep only the words that match the `contains` pattern.
    if (!is.null(contains)) {
        y <- y[grep(contains, y[, 1]), ]
        if (nrow(y) == 0) stop("No words match")
    }

    # When not alphabetical, order by descending frequency, ties by word.
    if (!alphabetical) {
        y <- y[order(-y$FREQ, y$WORD), ]
    }

    # Replace each `char2space` marker in the reported words with a space.
    if (!is.null(char2space)) {
        y[["WORD"]] <- mgsub(char2space, " ", y[["WORD"]])
    }

    # Prepend the "all_words" class, keeping the existing classes after it.
    class(y) <- c("all_words", class(y))
    y
}
p <- class(y)
class(y) <- c("all_words", p)
y
}

#' Prints an all_words Object
#'
Expand All @@ -93,12 +110,11 @@ print.all_words <- function(x, ...) {
#' @method Filter all_words
#' @return \code{Filter.all_words} - Returns a matrix of the class "all_words".
Filter.all_words <- function(x, min = 1, max = Inf,
count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) {

count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) {
# Filter method for "all_words" objects: forwards `x` and every filter option
# unchanged to word_list_filter_helper (defined outside this view).
# NOTE(review): this view is a rendered diff; the repeated signature and
# argument lines appear to be the pre- and post-commit sides of a
# whitespace-only change -- confirm against the repository file.
word_list_filter_helper(x, min = min, max = max,
count.apostrophe = count.apostrophe, stopwords = stopwords,
ignore.case = ignore.case, ...)
count.apostrophe = count.apostrophe, stopwords = stopwords,
ignore.case = ignore.case, ...)
}



3 changes: 3 additions & 0 deletions R/freq_terms.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@
#' plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i])
#' })
#' dev.off()
#'
#' ## Keep spaces
#' freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~")
#' }
freq_terms <-
function(text.var, top = 20, at.least = 1, stopwords = NULL,
Expand Down
13 changes: 12 additions & 1 deletion man/all_words.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
\title{Searches Text Column for Words}
\usage{
all_words(text.var, begins.with = NULL, contains = NULL,
alphabetical = TRUE, apostrophe.remove = FALSE, ...)
alphabetical = TRUE, apostrophe.remove = FALSE, char.keep = char2space,
char2space = "~~", ...)
}
\arguments{
\item{text.var}{The text variable.}
Expand All @@ -21,6 +22,13 @@ Use this if searching for a word containing the word chunk.}
\item{apostrophe.remove}{logical. If \code{TRUE} removes apostrophes from
the text before examining.}

\item{char.keep}{A character vector of symbol characters (i.e., punctuation)
that strip should keep. The default is to strip everything except
apostrophes. This enables the use of special characters to be turned into
spaces or for characters to be retained.}

\item{char2space}{A vector of characters to be turned into spaces.}

\item{\ldots}{Other arguments supplied to \code{\link[qdap]{strip}}.}
}
\value{
Expand Down Expand Up @@ -54,6 +62,9 @@ head(x5)

## Filter by nchar and stopwords
Filter(head(x3), min = 3)

## Keep spaces
all_words(space_fill(DATA$state, c("are you", "can be")))
}
}
\seealso{
Expand Down
3 changes: 3 additions & 0 deletions man/freq_terms.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ lapply(seq_along(out), function(i) {
plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i])
})
dev.off()
## Keep spaces
freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~")
}
}
\seealso{
Expand Down

0 comments on commit 4e0c8f8

Please sign in to comment.