# -------------------------------------------------------------------------------
#   This file is part of 'unityForest'.
#
# 'unityForest' is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 'unityForest' is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 'unityForest'. If not, see <http://www.gnu.org/licenses/>.
#
#  NOTE: 'unityForest' is a fork of the popular R package 'ranger', written by Marvin N. Wright.
#  Most R and C++ code is identical with that of 'ranger'. The package 'unityForest'
#  was written by taking the original 'ranger' code and making any
#  changes necessary to implement unity forests.
#
# -------------------------------------------------------------------------------

##' Prediction with new data and a saved forest from \code{\link{unityfor}}.
##'
##' For \code{type = 'response'} (the default), the predicted probabilities (probability estimation), predicted classes (classification), or predicted numeric values (continuous outcomes) are returned. 
##' For \code{type = 'terminalNodes'}, the IDs of the terminal node in each tree for each observation in the given dataset are returned.
##'
##' @title Unity Forest prediction
##' @param object \code{unityfor} object.
##' @param data New test data of class \code{data.frame}.
##' @param predict.all Return individual predictions for each tree instead of aggregated predictions for all trees. Return a matrix (sample x tree) for classification and continuous outcomes, and a 3d array for probability estimation (sample x class x tree).
##' @param num.trees Number of trees used for prediction. The first \code{num.trees} in the forest are used.
##' @param type Type of prediction. Either 'response' (default) or 'terminalNodes'. See below for details.
##' @param num.threads Number of threads to use.
##' The default is to use at most 2 threads (and at most the number of available CPU cores).
##' This conservative default avoids unintentionally using many cores on shared computing resources
##' (e.g., CI systems, servers, or HPC login/compute nodes).
##'
##' For typical use on a personal computer, setting \code{num.threads = 0} is strongly recommended,
##' as it uses all available CPU cores, which typically substantially reduces runtime.
##' @param verbose Verbose output on or off.
##' @param ... further arguments passed to or from other methods.
##' @return Object of class \code{unityfor.prediction} with elements
##'   \tabular{ll}{
##'       \code{predictions}    \tab Predicted classes/probabilities/values/terminal nodes  \cr
##'       \code{num.trees}   \tab Number of trees. \cr
##'       \code{num.independent.variables} \tab Number of independent variables. \cr
##'       \code{treetype}    \tab Type of forest/tree. Categorical or continuous outcome. \cr
##'       \code{num.samples}     \tab Number of samples.
##'   }
##' @references
##' \itemize{
##'   \item Hornung, R., Hapfelmeier, A. (2026). Unity Forests: Improving Interaction Modelling and Interpretability in Random Forests. arXiv:2601.07003, <\doi{10.48550/arXiv.2601.07003}>.
##'   \item Wright, M. N., Ziegler, A. (2017). ranger: A fast Implementation of Random Forests for High Dimensional Data in C++ and R. Journal of Statistical Software 77:1-17, <\doi{10.18637/jss.v077.i01}>.
##'   }
##' @seealso \code{\link{unityfor}}
##' @author Marvin N. Wright, Roman Hornung
##' @export
predict.unityfor <- function(object, data = NULL, predict.all = FALSE,
                           num.trees = object$num.trees,
                           type = "response",
                           num.threads = NULL,
                           verbose = TRUE, ...) {
  forest <- object$forest
  if (is.null(forest)) {
    stop("Error: No saved forest in unityfor object. Please set write.forest to TRUE when calling unityfor.")
  }
  
    ## Non-quantile prediction
    if (is.null(data)) {
     stop("Error: Argument 'data' is required for non-quantile prediction.") 
    }
    predict(forest, data, predict.all, num.trees, type, se.method = "infjack", seed = NULL, num.threads, verbose, forest$inbag.counts, ...)
}

# Author: Marvin N. Wright, Roman Hornung
#' @export
predict.unityfor.forest <- function(object, data, predict.all = FALSE,
                                  num.trees = object$num.trees, 
                                  type = "response", se.method = "infjack",
                                  seed = NULL, num.threads = NULL,
                                  verbose = TRUE, inbag.counts = NULL, ...) {

  ## GenABEL GWA data
  if ("gwaa.data" %in% class(data)) {
    stop("Error: Ordering of SNPs currently not implemented for unity forests.")
  }

  variable.names <- colnames(data)

  ## Check forest argument
  if (!inherits(object, "unityfor.forest")) {
    stop("Error: Invalid class of input object.")
  } else {
    forest <- object
  }
  if (is.null(forest$dependent.varID) || is.null(forest$num.trees) ||
        is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) ||
        is.null(forest$split.values) || is.null(forest$independent.variable.names) ||
        is.null(forest$treetype)) {
    stop("Error: Invalid forest object.")
  }
  
  ## Prediction type
  if (type == "response" || type == "se") {
    prediction.type <- 1
  } else if (type == "terminalNodes") {
    prediction.type <- 2
  } else {
    stop("Error: Invalid value for 'type'. Use 'response', 'se', 'terminalNodes', or 'quantiles'.")
  }
  
  ## Type "se" only for certain tree types
  if (type == "se" && se.method == "jack" && forest$treetype != "Regression") {
    stop("Error: Jackknife standard error prediction currently only available for regression.")
  }
  if (type == "se" && se.method == "infjack") {
   if (forest$treetype == "Survival") {
     stop("Error: Infinitesimal jackknife standard error prediction not yet available for survival.")
   } else if (forest$treetype == "Classification") {
     stop("Error: Not a probability forest. Set probability=TRUE to use the infinitesimal jackknife standard error prediction for classification.")
   }
  }
  
  ## Type "se" requires keep.inbag=TRUE
  if (type == "se" && is.null(inbag.counts)) {
    stop("Error: No saved inbag counts in divfor object. Please set keep.inbag=TRUE when calling divfor.")
  }
  
  ## Set predict.all if type is "se"
  if (type == "se") {
    predict.all <- TRUE
  }

  ## Create final data
    if (ncol(data) == length(forest$independent.variable.names)+1 && forest$dependent.varID > 0) {
      ## If alternative interface used and same data structure, don't subset data
      data.used <- data
    } else {
      ## If formula interface used, subset data
      data.selected <- data[, forest$independent.variable.names, drop = FALSE]

      ## Arrange data as in original data
      if (forest$dependent.varID == 0) {
        data.used <- cbind(0, data.selected)
        variable.names <- c("dependent", forest$independent.variable.names)
      } else if (forest$dependent.varID >= ncol(data)) {
        data.used <- cbind(data.selected, 0)
        variable.names <- c(forest$independent.variable.names, "dependent")
      } else {
        data.used <- cbind(data.selected[, 1:forest$dependent.varID],
                           0,
                           data.selected[, (forest$dependent.varID+1):ncol(data.selected)])
        variable.names <- c(forest$independent.variable.names[1:forest$dependent.varID],
                            "dependent",
                            forest$independent.variable.names[(forest$dependent.varID+1):length(forest$independent.variable.names)])
      }
    }

    ## Index of no-recode variables
    idx.norecode <- -(forest$dependent.varID+1)

  ## Recode characters
  if (!is.matrix(data.used) && !inherits(data.used, "Matrix")) {
    char.columns <- sapply(data.used, is.character)
    data.used[char.columns] <- lapply(data.used[char.columns], factor)
  }

  ## Recode factors if forest grown 'order' mode
  if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) {
    data.used[, idx.norecode] <- mapply(function(x, y) {
      if(is.null(y)) {
        x
      } else {
        new.levels <- setdiff(levels(x), y)
        factor(x, levels = c(y, new.levels), exclude = NULL)
      }
    }, data.used[, idx.norecode], forest$covariate.levels, SIMPLIFY = !is.data.frame(data.used[, idx.norecode]))
  }

  ## Convert to data matrix
  if (is.matrix(data.used) || inherits(data.used, "Matrix")) {
    data.final <- data.used
  } else {
    data.final <- data.matrix(data.used)
  }

  ## Check missing values
  if (any(is.na(data.final))) {
    offending_columns <- colnames(data.final)[colSums(is.na(data.final)) > 0]
    stop("Missing data in columns: ",
         paste0(offending_columns, collapse = ", "), ".", call. = FALSE)
  }

  if (sum(!(forest$independent.variable.names %in% variable.names)) > 0) {
    stop("Error: One or more independent variables not found in data.")
  }

  ## Num threads
  ## Default: use at most 2 threads. Set num.threads = 0 to use all available cores.
  if (is.null(num.threads)) {
    num.threads <- 2L
  } else if (!is.numeric(num.threads) || num.threads < 0) {
    stop("Error: Invalid value for num.threads")
  }
  
  ## Respect core limits during R CMD check / CRAN-like checks
  ## (only if running under package checks)
  if (nzchar(Sys.getenv("_R_CHECK_PACKAGE_NAME_", unset = ""))) {
    limit_cores <- Sys.getenv("_R_CHECK_LIMIT_CORES_", unset = "")
    if (nzchar(limit_cores)) {
      lim <- suppressWarnings(as.integer(limit_cores))
      if (!is.na(lim) && lim > 0) {
        if (num.threads == 0) {
          num.threads <- lim
        } else {
          num.threads <- min(num.threads, lim)
        }
      }
    }
  }

  ## Seed
  if (is.null(seed)) {
    seed <- runif(1 , 0, .Machine$integer.max)
  }

  if (forest$treetype == "Classification") {
    treetype <- 1
  } else if (forest$treetype == "Regression") {
    treetype <- 3
  } else if (forest$treetype == "Probability estimation") {
    treetype <- 9
  } else {
    stop("Error: Unknown tree type.")
  }

  ## Defaults for variables not needed
  dependent.variable.name <- ""
  mtry <- 0
  importance <- 0
  min.node.size <- 0
  split.select.weights <- list(c(0, 0))
  use.split.select.weights <- FALSE
  always.split.variables <- c("0", "0")
  use.always.split.variables <- FALSE
  status.variable.name <- "status"
  prediction.mode <- TRUE
  write.forest <- FALSE
  replace <- TRUE
  probability <- FALSE
  unordered.factor.variables <- c("0", "0")
  use.unordered.factor.variables <- FALSE
  save.memory <- FALSE
  splitrule <- 1
  alpha <- 0
  minprop <- 0
  case.weights <- c(0, 0)
  use.case.weights <- FALSE
  class.weights <- c(0, 0)
  keep.inbag <- FALSE
  sample.fraction <- 1
  holdout <- FALSE
  num.random.splits <- 1
  order.snps <- FALSE
  oob.error <- FALSE
  max.depth <- 0
  inbag <- list(c(0,0))
  use.inbag <- FALSE

  ## Use sparse matrix
  if ("dgCMatrix" %in% class(data.final)) {
    sparse.data <- data.final
    data.final <- matrix(c(0, 0))
    use.sparse.data <- TRUE
  } else {
    sparse.data <- Matrix(matrix(c(0, 0)))
    use.sparse.data <- FALSE
  }
  
  repr.tree.mode <- FALSE
  repr.var.names <- ""
  
  ## Call divfor
  result <- divforCpp(treetype, dependent.variable.name, data.final, variable.names, mtry,
                      num.trees, verbose, seed, num.threads, write.forest, importance,
                      min.node.size, min_node_size_root=0, split.select.weights, use.split.select.weights,
                      always.split.variables, use.always.split.variables,
                      status.variable.name, prediction.mode, forest, snp_data=as.matrix(0), replace, probability,
                      unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule,
                      case.weights, use.case.weights, class.weights, 
                      predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, 
                      prediction.type, num.random.splits, sparse.data, use.sparse.data,
                      order.snps, oob.error, max.depth, max_depth_root=3, num_cand_trees=1000, inbag, use.inbag, prop_var_root=0, 
					  prop_best_splits=0, repr.tree.mode, repr.var.names) ## asdf

  if (length(result) == 0) {
    stop("User interrupt or internal error.")
  }

  ## Prepare results
  result$num.samples <- nrow(data.final)
  result$treetype <- forest$treetype

  if (predict.all) {
    if (forest$treetype %in% c("Classification", "Regression")) {
      if (is.list(result$predictions)) {
        result$predictions <- do.call(rbind, result$predictions)
      } else {
        result$predictions <- array(result$predictions, dim = c(1, length(result$predictions)))
      }
    } else {
      ## TODO: Better solution for this?
      result$predictions <- aperm(array(unlist(result$predictions), 
                                        dim = rev(c(length(result$predictions), 
                                                    length(result$predictions[[1]]), 
                                                    length(result$predictions[[1]][[1]])))))
    }
  } else {
    if (is.list(result$predictions)) {
      result$predictions <- do.call(rbind, result$predictions)
    } 
  }
  
  if (type == "response") {
    if (forest$treetype == "Classification" && !is.null(forest$levels)) {
      if (!predict.all) {
        result$predictions <- integer.to.factor(result$predictions, forest$levels)
      }
    } else if (forest$treetype == "Regression") {
      ## Empty
    } else if (forest$treetype == "Probability estimation" && !is.null(forest$levels)) {
      if (!predict.all) {
        if (is.vector(result$predictions)) {
          result$predictions <- matrix(result$predictions, nrow = 1)
        }
        
        ## Set colnames and sort by levels
        colnames(result$predictions) <- forest$levels[forest$class.values]
        result$predictions <- result$predictions[, forest$levels[sort(forest$class.values)], drop = FALSE]
      }
    }
  } else if (type == "terminalNodes") {
    if (is.vector(result$predictions)) {
      result$predictions <- matrix(result$predictions, nrow = 1)
    }
  }

  ## Compute Jackknife
  if (type == "se") {
    ## Aggregated predictions
    if (length(dim(result$predictions)) > 2) {
      yhat <- apply(result$predictions, c(1, 2), mean)
    } else {
      yhat <- rowMeans(result$predictions)
    }

    ## Get inbag counts, keep only observations that are OOB at least once
    inbag.counts <- simplify2array(inbag.counts) 
    if (is.vector(inbag.counts)) {
      inbag.counts <- t(as.matrix(inbag.counts))
    }
    inbag.counts <- inbag.counts[rowSums(inbag.counts == 0) > 0, , drop = FALSE] 
    n <- nrow(inbag.counts)
    oob <- inbag.counts == 0
    if (num.trees != object$num.trees) {
      oob <- oob[, 1:num.trees]
    }
    
    if (all(!oob)) {
      stop("Error: No OOB observations found, consider increasing num.trees or reducing sample.fraction.")
    }

    if (se.method == "jack") {
      ## Compute Jackknife
      oob.count <- rowSums(oob)
      jack.n <- sweep(tcrossprod(result$predictions, oob), 
                      2, oob.count, "/", check.margin = FALSE)
      if (is.vector(jack.n)) {
        jack.n <- t(as.matrix(jack.n))
      }
      if (any(oob.count == 0)) {
        n <- sum(oob.count > 0)
        jack.n <- jack.n[, oob.count > 0]
      } 
      jack <- (n - 1) / n * rowSums((jack.n - yhat)^2)
      bias <- (exp(1) - 1) * n / result$num.trees^2 * rowSums((result$predictions - yhat)^2)
      jab <- pmax(jack - bias, 0)
      result$se <- sqrt(jab)
    } else if (se.method == "infjack") {
	  stop("Error: 'se.method == infjack' currently not supported for unity forests.")
      #if (forest$treetype == "Regression") {
      #  infjack <- rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees = 1:num.trees)
      #  result$se <- sqrt(infjack$var.hat)
      #} else if (forest$treetype == "Probability estimation") {
      #  infjack <- apply(result$predictions, 2, function(x) {
      #    rInfJack(x, inbag.counts)$var.hat
      #  })
      #  result$se <- sqrt(infjack)
      #} 
    } else {
      stop("Error: Unknown standard error method (se.method).")
    }
    
    ## Response as predictions
    result$predictions <- yhat
    
    if (forest$treetype == "Probability estimation") {
      ## Set colnames and sort by levels
      colnames(result$predictions) <- forest$levels[forest$class.values]
      result$predictions <- result$predictions[, forest$levels, drop = FALSE]
    }
  }

  class(result) <- "unityfor.prediction"
  return(result)
}
