#' stem the terms in an object
#' 
#' Apply a stemmer to words.  This is a wrapper to \link[SnowballC]{wordStem} 
#' designed to allow this function to be called without loading the entire 
#' \pkg{SnowballC} package.  \link[SnowballC]{wordStem}  uses Martin Porter's 
#' stemming algorithm and the C libstemmer library generated by Snowball.
#' @param x a character, tokens, or dfm object whose word stems are to be
#'   removed.  If tokenized texts, the tokenization must be word-based.
#' @param language the name of a recognized language, as returned by 
#'   \link[SnowballC]{getStemLanguages}, or a two- or three-letter ISO-639 code 
#'   corresponding to one of these languages (see references for the list of 
#'   codes)
#' @seealso \link[SnowballC]{wordStem}
#'   
#' @references \url{http://snowball.tartarus.org/}
#'   
#'   \url{http://www.iso.org/iso/home/standards/language_codes.htm} for the 
#'   ISO-639 language codes
#' @export
#' @import stringi
#' @return \code{tokens_wordstem} returns a \link{tokens} object whose word
#'   types have been stemmed.
#' @examples
#' # example applied to tokens
#' txt <- c(one = "eating eater eaters eats ate",
#'          two = "taxing taxes taxed my tax return")
#' th <- tokens(txt)
#' tokens_wordstem(th)
#' 
tokens_wordstem <- function(x, language = "porter") {
    
    # call the old type of tokenizedTexts if that is what is passed
    if (is.tokenizedTexts(x) & !is.tokens(x))
        return(tokenizedTexts_wordstem(x, language = language))
    
    if (!is.tokens(x))
        stop("x must be a tokens object")
    
    if (identical(as.integer(attributes(x)$ngrams), 1L))
        types(x) <- char_wordstem(types(x), language = language)
    else 
        types(x) <- wordstem_Ngrams(types(x), 
                                    concatenator = attributes(x)$concatenator, 
                                    language = language)
    
    tokens_hashed_recompile(x)
}

tokenizedTexts_wordstem <- function(x, language = "porter") {
    origAttrs <- attributes(x)
    if (!grepl("word", attr(x, "what")) || any(unlist(lapply(x, function(y) stringi::stri_detect_fixed(y, " ") & !is.na(y)))))
        stop("whitespace detected: you can only stem word-tokenized texts")
    if (all.equal(attributes(x)$ngrams, 1))
        result <- lapply(x, SnowballC::wordStem, language)
    else {
        # catm("Ngrams wordstem\n")
        result <- wordstem_Ngrams(x, attributes(x)$concatenator, language)
    }
    class(result) <- c("tokenizedTexts", class(result))
    result[which(is.na(x))] <- NA
    attributes(result) <- origAttrs
    result
}

#' @rdname tokens_wordstem
#' @import stringi 
#' @export
#' @return \code{char_wordstem} returns a \link{character} object whose word
#'   types have been stemmed.
#' @examples
#' # simple example
#' char_wordstem(c("win", "winning", "wins", "won", "winner"))
#' 
char_wordstem <- function(x, language = "porter") {
    if (!is.character(x))
        stop("x must be a character object")
    if (any(stringi::stri_detect_fixed(x, " ") & !is.na(x)))
        stop("whitespace detected: you can only stem tokenized texts")
    result <- SnowballC::wordStem(x, language)
    result[which(is.na(x))] <- NA
    result
}


#' @rdname tokens_wordstem
#' @return \code{dfm_wordstem} returns a \link{dfm} object whose word
#'   types (features) have been stemmed, and recombined to consolidate features made
#'   equivalent because of stemming.
#' @examples 
#' # example applied to a dfm
#' (origdfm <- dfm(txt))
#' dfm_wordstem(origdfm)
#' 
#' @export
dfm_wordstem <- function(x, language = "porter") {
    if (!is.dfm(x))
        stop("x must be a dfm object")
    if (identical(as.integer(x@ngrams), 1L)) 
        colnames(x) <- char_wordstem(featnames(x), language = language)
    else
        colnames(x) <- wordstem_Ngrams(featnames(x), x@concatenator, language)
    dfm_compress(x, margin = "features")
}


###
### internal functions
###

# stemming for ngrams, internal function
wordstem_Ngrams <- function(x, concatenator, language) {
    result <- lapply(x, strsplit, concatenator, fixed = TRUE)
    result <- lapply(result, function(y) lapply(y, SnowballC::wordStem, language = language))
    result <- lapply(result, function(y) vapply(y, paste, character(1), collapse = concatenator))
    # simple way to return a character vector if supplied a character vector
    if (!is.list(x)) result <- unlist(result)
    result
}



###
### Deprecated functions
### 

#' stem words
#' 
#' Deprecated function names to stem words.  See \code{\link{char_wordstem}}, 
#' \code{\link{tokens_wordstem}}, and \code{\link{dfm_wordstem}} instead.
#' @export
#' @keywords internal deprecated
wordstem <- function(x, language = "porter") {
    UseMethod("wordstem")
}

#' @rdname wordstem
#' @export
wordstem.dfm <- function(x, language = "porter") {
    .Deprecated("dfm_wordstem")
    dfm_wordstem(x, language = language)
}

#' @rdname wordstem
#' @export
wordstem.tokens <- function(x, language = "porter") {
    .Deprecated("dfm_tokens")
    tokens_wordstem(x, language = language)
}

#' @rdname wordstem
#' @import stringi 
#' @export
wordstem.tokenizedTexts <- function(x, language = "porter") {
    .Deprecated("tokens_wordstem")
    tokens_wordstem(x, language = language)
}

#' @rdname wordstem
#' @import stringi 
#' @export
wordstem.character <- function(x, language = "porter") {
    .Deprecated("char_wordstem")
    char_wordstem(x, language = language)
}




