# Label the tests in this file as belonging to the "tokens" context.
context("tokens")


test_that("'tokens' splits according to UAX #29", {
    # Sample sentence mixing curly quotes, a typographic apostrophe,
    # a decimal number, and punctuation.
    input <- paste0("The quick (\u201cbrown\u201d) fox can\u2019t",
                    " jump 32.3 feet, right?")

    # With filter = NULL the expected tokens keep their original case and
    # include the whitespace segments between words.
    expected <- list(
        c("The", " ", "quick", " ", "(", "\u201c", "brown", "\u201d", ")",
          " ", "fox", " ", "can\u2019t", " ", "jump", " ", "32.3", " ",
          "feet", ",", " ", "right", "?")
    )

    actual <- tokens(input, filter = NULL)

    expect_equal(actual, expected)
})


test_that("'tokens' normalizes tokens by default", {
    # Same sample sentence as the raw-splitting test, but tokenized with
    # the default filter.
    input <- paste0("The quick (\u201cbrown\u201d) fox can\u2019t",
                    " jump 32.3 feet, right?")

    # Expected result: lower-cased words, curly quotes and the typographic
    # apostrophe mapped to ASCII "'", and no whitespace tokens.
    expected <- list(
        c("the", "quick", "(", "'", "brown", "'", ")", "fox", "can't",
          "jump", "32.3", "feet", ",", "right", "?")
    )

    actual <- tokens(input)

    expect_equal(actual, expected)
})


test_that("'tokens' propagates names if its argument has them", {
    # The same named input, once built with text() and once as a plain
    # named character vector.
    text_input <- text(a = "First sentence.", b = "Second sentence!")
    char_input <- c(a = "First sentence.", b = "Second sentence!")

    # Both inputs should tokenize to the same named list.
    text_result <- tokens(text_input)
    expect_equal(
        text_result,
        list(a = c("first", "sentence", "."),
             b = c("second", "sentence", "!"))
    )

    char_result <- tokens(char_input)
    expect_equal(
        char_result,
        list(a = c("first", "sentence", "."),
             b = c("second", "sentence", "!"))
    )
})


test_that("'tokens' works on empty values", {
    # An empty input (c() evaluates to NULL) should yield an empty list.
    empty_input <- c()
    result <- tokens(empty_input)

    expect_equal(result, list())
})


test_that("'tokens' works on empty and missing values", {
    # Mix of ordinary strings, an empty string, and a missing value.
    input <- c("1", "2", "", NA, "5")

    # The empty string gives zero tokens; NA stays a single NA token.
    expected <- list("1", "2", character(), NA_character_, "5")

    expect_equal(tokens(input), expected)
})


test_that("'tokens' should work on S3 objects", {
    # A character vector tagged with a custom S3 class.
    x <- structure(c(a="I LIKE TO SHOUT!! HA HA!",
                     b="There's no need. For that.",
                     c="Why not? :("),
                   class="upper")

    # The as.character method for class "upper" is installed in the global
    # environment (where the original `<<-` assignment ended up), presumably
    # so that S3 dispatch from inside `tokens` can find it.  Unlike the
    # original, it is removed again once the test finishes so the method
    # does not leak into later tests.
    global <- globalenv()
    assign("as.character.upper",
           function(x) vapply(unclass(x), toupper, character(1)),
           envir = global)

    tryCatch({
        # Reference value: convert explicitly, then tokenize the plain
        # character vector.
        x2 <- as.character(x)
        names(x2) <- names(x)

        # Tokenizing the S3 object directly should give the same result.
        toks <- tokens(x)
        toks2 <- tokens(x2)
        expect_equal(toks, toks2)
    }, finally = {
        # Always undo the global side effect, even if the test body errors.
        rm(list = "as.character.upper", envir = global)
    })
})
