#' @rdname decode
setGeneric("decode", function(.Object, ...) standardGeneric("decode"))

setOldClass("Annotation")
setOldClass("AnnotatedPlainTextDocument")


setAs(from = "corpus", to = "Annotation", def = function(from){
  
  # Implemented only for class 'corpus', 'subcorpus'-class will inherit from it.
  
  if (!requireNamespace(package = "NLP", quietly = TRUE))
    stop("Package 'NLP' required but not available")

  word <- get_token_stream(from@cpos, corpus = from@corpus, p_attribute = "word", encoding = from@encoding)
  pos <- get_token_stream(from@cpos, corpus = from@corpus, p_attribute = "pos", encoding = from@encoding)
  whitespace_after <- c(ifelse(pos %in% c("$.", "$,"), FALSE, TRUE)[2L:length(pos)], FALSE)
  word_with_whitespace <- paste(word, ifelse(whitespace_after, " ", ""), sep = "")
  s <- paste(word_with_whitespace, collapse = "")
  word_length <- sapply(word, nchar)
  left_offset <- c(1L, (cumsum(sapply(word_with_whitespace, nchar)) + 1L)[1L:(length(word) - 1L)] )
  names(left_offset) <- word
  right_offset <- left_offset + word_length - 1L
  names(right_offset) <- word
  cpos <- cpos(from@cpos)
  w <- NLP::Annotation(
    id = cpos,
    rep.int("word", length(cpos)),
    start = left_offset,
    end = right_offset
  )

  right_offset <- left_offset + word_length
  names(right_offset) <- word # repeats 
  m <- matrix(data = c(left_offset, right_offset), ncol = 2, byrow = FALSE)
  f <- cut(x = 1L:length(pos), breaks = unique(c(1L, grep("\\$\\.", pos), length(pos))), include.lowest = TRUE)
  chunks <- split(x = m, f = f)
  sentence_left <- sapply(chunks, min)
  sentence_right <- sapply(chunks, max) - 1L
  s <- NLP::Annotation(
    id = 1L:length(sentence_left),
    rep.int("sentence", length(sentence_left)),
    start = sentence_left,
    end = sentence_right
  )
  c(w, s)
})


setAs(from = "corpus", to = "AnnotatedPlainTextDocument", def = function(from){
  
  # Implemented only for class 'corpus', 'subcorpus'-class will inherit from it.
  
  if (!requireNamespace(package = "NLP", quietly = TRUE))
    stop("Package 'NLP' required but not available.")
  
  message("... decode p-attributes")
  p_attrs <- p_attributes(from)
  ts <- decode(from, p_attribute = p_attrs, s_attributes = character())

  message("... generate plain text string")
  whitespace_after <- if ("pos" %in% p_attrs){
    # still assumes that STSS is used
    c(ifelse(ts[["pos"]] %in% c("$.", "$,"), FALSE, TRUE)[2L:nrow(ts)], FALSE)
  } else {
    c(grepl("^[.,;!?]$", ts[["word"]])[2L:nrow(ts)], FALSE)
  }
  
  word_with_whitespace <- paste(ts[["word"]], ifelse(whitespace_after, " ", ""), sep = "")
  s <- paste(word_with_whitespace, collapse = "")
  
  message("... generate token-level annotation")
  left_offset <- c(1L, (cumsum(nchar(word_with_whitespace)) + 1L)[1L:(nrow(ts) - 1L)])
  right_offset <- left_offset + nchar(ts[["word"]]) - 1L
  
  w <- NLP::Annotation(
    id = cpos(from@cpos),
    rep.int("word", nrow(ts)),
    start = left_offset,
    end = right_offset,
    features = lapply(split(ts[, p_attrs, with = FALSE], f = 1L:nrow(ts)), as.data.frame)
  )
  
  message("... which s-attributes are document-level metadata?")
  s_attrs <- s_attributes(from)
  meta_candidates <- lapply(
    setNames(s_attrs, s_attrs),
    function(s_attr){
      message(sprintf("... checking if s-attribute is metadata: %s ... ", s_attr), appendLF = FALSE)
      strucs <- cl_cpos2struc(corpus = from@corpus, s_attribute = s_attr, cpos = cpos(from@cpos))
      if (any(strucs < 0L)){
        message("NO (not comprehensive)")
        NULL
      } else {
        values <- cl_struc2str(corpus = from@corpus, s_attribute = s_attr, struc = strucs)
        unique_values <- unique(values)
        if (length(unique_values) == 1L){
          message("OK")
          return(unique_values)
        } else {
          message("NO (changes)")
          character()
        }
      }
    }
  )
  meta <- meta_candidates
  # attributes that do not cover the entire subcorpus and that have different
  # values are not metadata - discard it
  for (i in rev(which(sapply(meta_candidates, length) != 1L))) meta[[i]] <- NULL
  
  # s-attributes that do not cover entire subcorpus (negative values) are 
  # annotations of regions of text
  s_attr_anno <- names(meta_candidates)[sapply(meta_candidates, is.null)]
  mw_annotations <- do.call(c, lapply(
    s_attr_anno,
    function(s_attr){
      strucs <- unique(
        cl_cpos2struc(corpus = from@corpus, s_attribute = s_attr, cpos = cpos(from@cpos))
      )
      strucs <- strucs[which(strucs >= 0L)]
      do.call(c,
        lapply(
          strucs,
          function(struc){
            cpos <- cl_struc2cpos(corpus = from@corpus, s_attribute = s_attr, struc = struc)
            NLP::Annotation(
              id = -1, # assign id later on, if necessary
              type = s_attr,
              start = w[which(w$id == min(cpos))]$start,
              end = w[which(w$id == max(cpos))]$end,
              features = list(list(constituents = cpos))
            )
          }
        )
      )
    }
  ))
  
  a <- if (length(mw_annotations) > 0L) c(w, mw_annotations) else w
  
  NLP::AnnotatedPlainTextDocument(s = s, a = a, meta = meta)
})


#' Decode corpus or subcorpus.
#' 
#' Decode \code{corpus} or \code{subcorpus} and return class specified by
#' argument \code{to}.
#'
#' The primary purpose of the method is type conversion. By obtaining the corpus
#' or subcorpus in the format specified by the argument \code{to}, the data can
#' be processed with tools that do not rely on the Corpus Workbench (CWB).
#' Supported output formats are \code{data.table} (which can be converted to a
#' \code{data.frame} or \code{tibble} easily) or an \code{Annotation} object as
#' defined in the package \code{NLP}. Another purpose of decoding the corpus can
#' be to rework it, and to re-import it into the CWB (e.g. using the
#' \code{cwbtools}-package).
#'
#' An earlier version of the method included an option to decode a single
#' s-attribute, which is not supported any more. See the
#' \code{s_attribute_decode} function of the package RcppCWB.
#'
#' @return The return value will correspond to the class specified by argument
#'   \code{to}.
#' 
#' @param .Object The \code{corpus} or \code{subcorpus} to decode.
#' @param to The class of the returned object, stated as a length-one
#'   \code{character} vector.
#' @param s_attributes The structural attributes to decode. If \code{NULL}
#'   (default), all structural attributes will be decoded.
#' @param p_attributes The positional attributes to decode. If \code{NULL}
#'   (default), all positional attributes will be decoded.
#' @param decode A \code{logical} value, whether to decode token ids and struc
#'   ids to character strings. If \code{FALSE}, the values of columns for p- and
#'   s-attributes will be \code{integer} vectors. If \code{TRUE} (default), the
#'   respective columns are \code{character} vectors.
#' @param ... Further arguments.
#' @param verbose A \code{logical} value, whether to output progess messages.
#' @exportMethod decode
#' @importFrom RcppCWB get_region_matrix
#' @seealso To decode a structural attribute, you can use the
#'   \code{\link{s_attributes}}-method, setting argument \code{unique} as
#'   \code{FALSE} and \code{\link[RcppCWB]{s_attribute_decode}}. See
#'   \code{\link{as.VCorpus}} to decode a \code{partition_bundle} object,
#'   returning a \code{VCorpus} object.
#' @examples
#' use("polmineR")
#' use(pkg = "RcppCWB", corpus = "REUTERS")
#' 
#' # Decode corpus as data.table
#' dt <- decode("GERMAPARLMINI", to = "data.table")
#' 
#' # Decode corpus selectively
#' dt <- decode("GERMAPARLMINI", to = "data.table", p_attributes = "word", s_attributes = "party")
#' 
#' # Decode a subcorpus
#' dt <- corpus("GERMAPARLMINI") %>%
#'   subset(speaker == "Angela Dorothea Merkel") %>%
#'   decode(s_attributes = c("speaker", "party", "date"), to = "data.table")
#' 
#' # Decode subcorpus selectively
#' corpus("GERMAPARLMINI") %>%
#'   subset(speaker == "Angela Dorothea Merkel") %>%
#'   decode(to = "data.table", p_attributes = "word", s_attributes = "party")
#' 
#' # Decode partition
#' P <- partition("REUTERS", places = "kuwait", regex = TRUE)
#' dt <- decode(P)
#' 
#' # Previous versions of polmineR offered an option to decode a single
#' # s-attribute. This is how you could proceed to get a table with metadata.
#' dt <- decode(P, s_attribute = "id", decode = FALSE)
#' dt[, "word" := NULL]
#' dt[,{list(cpos_left = min(.SD[["cpos"]]), cpos_right = max(.SD[["cpos"]]))}, by = "id"]
#' 
#' # Decode subcorpus as Annotation object
#' \dontrun{
#' if (requireNamespace("NLP")){
#'   library(NLP)
#'   p <- corpus("GERMAPARLMINI") %>%
#'     subset(date == "2009-11-10" & speaker == "Angela Dorothea Merkel")
#'   s <- as(p, "String")
#'   a <- as(p, "Annotation")
#'   
#'   # The beauty of having this NLP Annotation object is that you can now use 
#'   # the different annotators of the openNLP package. Here, just a short scenario
#'   # how you can have a look at the tokenized words and the sentences.
#' 
#'   words <- s[a[a$type == "word"]]
#'   sentences <- s[a[a$type == "sentence"]] # does not yet work perfectly for plenary protocols 
#'   
#'   doc <- as(p, "AnnotatedPlainTextDocument")
#' }
#' }
#' @rdname decode
setMethod("decode", "corpus", function(.Object, to = c("data.table", "Annotation"), p_attributes = NULL, s_attributes = NULL, decode = TRUE, verbose = TRUE){
  if (to == "data.table"){

    if (is.null(p_attributes)) p_attributes <- p_attributes(.Object)
    if (!all(p_attributes %in% p_attributes(.Object)))
      stop("Not all p_attributes provided are available.")
    

    p_attribute_list <- lapply(
      setNames(p_attributes, p_attributes),
      function(p_attr){
        if (verbose) message("decoding p-attribute:", p_attr)
        get_token_stream(.Object, p_attribute = p_attr, decode = decode)
      }
    )

    max_cpos <- size(.Object) - 1L
    
    if (is.null(s_attributes)) s_attributes <- s_attributes(.Object)
    if (length(s_attributes) > 0L){
      if (!all(s_attributes %in% s_attributes(.Object)))
        stop("Not all s_attributes provided are available.")
    }
    
    if (length(s_attributes) > 0L){
      s_attribute_list <- lapply(
        setNames(s_attributes, s_attributes),
        function(s_attr){
          if (verbose) message("decoding s-attribute:", s_attr)
          struc <- cl_cpos2struc(
            corpus = .Object@corpus, registry = .Object@registry_dir,
            s_attribute = s_attr, cpos = 0L:max_cpos
          )
          if (decode){
            str <- cl_struc2str(
              corpus = .Object@corpus, registry = .Object@registry_dir,
              s_attribute = s_attr, struc = struc
            )
            Encoding(str) <- encoding(.Object)
            return(as.nativeEnc(str, from = encoding(.Object)))
          } else {
            return(struc)
          }
        }
      )
    } else {
      s_attribute_list <- list()
    }

    message("assembling data.table")
    combined_list <- c(
      list(cpos = 0L:max_cpos),
      p_attribute_list,
      s_attribute_list
    )
    y <- data.table::as.data.table(combined_list)
    
  } else if (to == "Annotation"){
    y <- as(.Object, "Annotation")
  } else {
    stop("The 'to'-argument of the decode()-method is required to be either 'data.table' or 'Annotation'.")
  }
  y
})


#' @exportMethod decode
#' @rdname decode
setMethod("decode", "character", function(.Object, to = c("data.table", "Annotation"), s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
  decode(corpus(.Object), to = to, s_attributes = s_attributes, p_attributes = p_attributes, decode = decode, verbose = verbose)
})





#' @exportMethod decode
#' @rdname decode
setMethod("decode", "slice", function(.Object, to = "data.table", s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
  if (to == "data.table"){
    
    if (is.null(p_attributes)) p_attributes <- p_attributes(.Object)
    if (!all(p_attributes %in% p_attributes(.Object))) stop("Not all p_attributes provided are available.")
    
    y <- data.table(cpos = cpos(.Object@cpos))

    for (p_attr in p_attributes){
      if (verbose) message("... decoding p_attribute ", p_attr)
      y[, (p_attr) := get_token_stream(.Object, p_attribute = p_attr, decode = decode)]
    }
    
    if (is.null(s_attributes)) s_attributes <- s_attributes(.Object)
    if (length(s_attributes) > 0L){
      if (!all(s_attributes %in% s_attributes(.Object))) stop("Not all s_attributes provided are available.")
      strucs <- RcppCWB::cl_cpos2struc(
        corpus = .Object@corpus, registry = .Object@registry_dir,
        s_attribute = s_attributes[1], cpos = .Object@cpos[,1]
      )
      
      s_attr_dt <- data.table(
        RcppCWB::get_region_matrix(
          corpus = .Object@corpus, registry = .Object@registry_dir,
          s_attribute = s_attributes[1], strucs = strucs
        )
      )
      setnames(s_attr_dt, old = c("V1", "V2"), new = c("cpos_left", "cpos_right"))
      s_attr_dt[, "struc" := strucs]
      
      for (s_attr in s_attributes){
        if (decode){
          if (verbose) message("... decoding s_attribute ", s_attr)
          str <- cl_struc2str(corpus = .Object@corpus, registry = .Object@registry_dir, s_attribute = s_attr, struc = strucs)
          Encoding(str) <- encoding(.Object)
          s_attr_dt[, (s_attr) := as.nativeEnc(str, from = encoding(.Object))]
        } else {
          s_attr_dt[, (s_attr) := strucs]
        }
      }
      
      unfold <- function(.SD){
        dt <- data.table(cpos = .SD[["cpos_left"]]:.SD[["cpos_right"]])
        for (s_attr in s_attributes){
          value <- .SD[[s_attr]]
          dt[, (s_attr) := value]
        }
        dt
      }
      s_attr_dt_ext <- s_attr_dt[, unfold(.SD), by = "struc"]
      y <- y[s_attr_dt_ext, on = "cpos"]
      setcolorder(y, neworder = c("cpos", p_attributes, s_attributes))
    }
  } else if (to == "Annotation"){
    y <- as(.Object, "Annotation")
  }
  y
})

#' @rdname decode
setMethod("decode", "partition", function(.Object, to = "data.table", s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
  callNextMethod()
})


#' @exportMethod decode
#' @rdname decode
setMethod("decode", "subcorpus", function(.Object, to = "data.table", s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
  callNextMethod()
})


#' @details If \code{.Object} is an \code{integer} vector, it is assumed to be a
#'   vector of integer ids of p-attributes. The \code{decode}-method will
#'   translate token ids to string values as efficiently as possible. The
#'   approach taken will depend on the corpus size and the share of the corpus
#'   that is to be decoded. To decode a large number of integer ids, it is more
#'   efficient to read the lexicon file from the data directory directly and to
#'   index the lexicon with the ids rather than relying on
#'   \code{RcppCWB::cl_id2str}. The internal decision rule is to use the lexicon
#'   file when the corpus is larger than 10 000 000 million tokens and more than
#'   5 percent of the corpus are to be decoded. The encoding of the
#'   \code{character} vector that is returned will be the coding of the locale
#'   (usually ISO-8859-1 on Windows, and UTF-8 on macOS and Linux machines).
#' @param boost A length-one \code{logical} value, whether to speed up decoding
#'   a long vector of token ids by directly by reading in the lexion file from
#'   the data directory of a corpus. If \code{NULL} (default), the internal
#'   decision rule is that \code{boost} will be \code{TRUE} if the corpus is
#'   larger than 10 000 000 million tokens and more than 5 percent of the corpus
#'   are to be decoded.
#' @param corpus A CWB indexed corpus, either a length-one \code{character} vector,
#'   or a \code{corpus} object.
#' @exportMethod decode
#' @rdname decode
#' @examples
#'  
#' # decode vector of token ids
#' y <- decode(0:20, corpus = "GERMAPARLMINI", p_attributes = "word")
#' @importFrom stringi stri_encode
setMethod("decode", "integer", function(.Object, corpus, p_attributes, boost = NULL){
  stopifnot(
    length(corpus) == 1L,
    length(p_attributes) == 1L,
    is.character(p_attributes)
  )
  corpus <- if (is.character(corpus)) corpus(corpus) else corpus
  if (!inherits(corpus, "corpus")) stop("Argument 'corpus' is required to be a corpus object.")
  
  if (is.null(boost)){
    boost <- if (corpus@size > 10000000L && length(cpos) >= (corpus@size * 0.05)) TRUE else FALSE
  }
  
  if (isTRUE(boost)){
    lexfile <- fs::path(corpus@data_dir, sprintf("%s.lexicon", p_attributes))
    lexicon <- readBin(con = lexfile, what = character(), n = file.info(lexfile)$size)
    Encoding(lexicon) <- corpus@encoding
    if (!identical(corpus@encoding, encoding())){
      # lexicon <- stringi::stri_encode(lexicon, from = corpus@encoding, to = encoding()) # as.locale
      lexicon <- iconv(lexicon, from = corpus@encoding, to = encoding())
      Encoding(lexicon) <- encoding()
    }
    y <- lexicon[.Object + 1L]
  } else if (isFALSE(boost)){
    y <- RcppCWB::cl_id2str(
      corpus = corpus@corpus, registry = corpus@registry_dir,
      p_attribute = p_attributes, id = .Object
    )
    Encoding(y) <- corpus@encoding
    if (!identical(corpus@encoding, encoding())){
      # y <- stringi::stri_encode(y, from = corpus@encoding, to = encoding())
      y <- iconv(y, from = corpus@encoding, to = encoding())
      Encoding(y) <- encoding()
    }
  }
  y
})


#' @details The \code{decode}-method for \code{data.table} objects will decode
#'   token ids (column '`p-attribute`_id'), adding the corresponding string as a
#'   new column. If a column "cpos" with corpus positions is present, ids are
#'   derived for the corpus positions given first. If the \code{data.table}
#'   neither has a column "cpos" nor columns with token ids (i.e. colummn name
#'   ending with "_id"), the input \code{data.table} is returned unchanged. Note
#'   that columns are added to the \code{data.table} in an in-place operation to
#'   handle memory parsimoniously.
#' @examples 
#' dt <- data.table::data.table(cpos = cpos("GERMAPARLMINI", query = "Liebe")[,1])
#' decode(dt, corpus = "GERMAPARLMINI", p_attributes = c("word", "pos"))
#' y <- dt[, .N, by = c("word", "pos")]
#' @rdname decode
setMethod("decode", "data.table", function(.Object, corpus, p_attributes){
  corpus <- if (is.character(corpus)) corpus(corpus) else corpus
  if (!inherits(corpus, "corpus")) stop("Argument 'corpus' is required to be a corpus object.")
  if (isFALSE(all(p_attributes %in% p_attributes(corpus)))){
    stop("At least one p-attribute is not available.")
  }
  
  if ("cpos" %in% colnames(.Object)){
    for (p_attr in p_attributes){
      p_attr_id <- paste(p_attr, "id", sep = "_")
      ids <- cpos2id(corpus, p_attribute = p_attr, cpos = .Object[["cpos"]])
      .Object[, (p_attr_id) := ids]
    }
  }
  
  for (p_attr in p_attributes){
    p_attr_id <- paste(p_attr, "id", sep = "_")
    if (p_attr_id %in% colnames(.Object)){
      .Object[, (p_attr) := decode(.Object[[p_attr_id]], corpus = corpus, p_attributes = p_attr)]
    }
  }
  
  .Object
})
