#Cross Validation for SES

#INPUT
#target as in SES
#dataset as in SES
#kfolds: number of folds (integer)
#folds: already defined folds of the data to use (a list generated by generateCVRuns {TunePareto}). If NULL the folds created internally with the same function
#alphas: vector of SES alphas hyper parameters used in CV. Default is c(0.1, 0.05, 0.01)
#max_ks: vector of SES max_k hyper parameters used in CV. Default is c(3, 2)
#task: character, it can be "C" for classification (logistic regression classifier), "R" for regression (linear regression classifier), "S" for cox survival analysis (cox regression classifier)
#metric: a metric function provided by the user or auto defined due to the task. It may be NULL or a function in the form of other metric functions (e.g., mse.mxm). For example the default for the classification task is auc.mxm but the user can also define acc.mxm (based on the accuracy metric) that is supported on the package. Or the user can make his own metric function that follows the signature and the inputs, outputs of ours.
#modeler: a modeling function provided by the user or auto defined due to the task if it is NULL (e.g., lm.mxm)
#ses_test: A function object that defines the test used in the SES function (see SES help page for more). If it is NULL, it is auto defined due to the task.

#OUTPUT
#a list called best_model with the below slots
#cv_results_all: a list with the predictions, performances and the signatures for each fold of each configuration (i.e. cv_results_all[[3]]$performances[1] indicates the performance of the 1st fold with the 3rd configuration of SES)
#best_performance: numeric, the best average performance
#best_configuration: the best configuration of SES (a list with the slots id, a, max_k)

cv.ses <- function(target, dataset, kfolds = 10, folds = NULL, alphas = NULL, max_ks = NULL, task = NULL, metric = NULL, modeler = NULL, ses_test = NULL)
{
  # Cross-validates SES over the grid of (alpha, max_k) configurations.
  # For every fold and configuration it runs SES on the training part, fits the
  # task model on the first returned signature, scores it on the held-out part
  # with the metric function, and finally picks the configuration with the
  # highest average score (all metric functions in this file are written so
  # that higher values mean better performance).
  #
  # Returns a list with the slots
  #   cv_results_all     : per configuration, the predictions, performances and
  #                        signatures of every fold
  #   best_performance   : average performance of the selected configuration
  #   best_configuration : list(id, a, max_k) of the selected configuration
  #   BC_best_perf       : best_performance minus an estimate of the
  #                        optimism introduced by selecting the best configuration

  # Fail early with the documented message; comparing a NULL task with `==`
  # would otherwise abort with an unrelated "argument is of length zero" error.
  if (is.null(task) || length(task) != 1 || !(task %in% c("C", "R", "S"))) {
    stop("Please provide a valid task argument 'C'-classification, 'R'-regression, 'S'-survival.")
  }

  if (is.null(alphas)) {
    alphas <- c(0.1, 0.05, 0.01)
  }
  if (is.null(max_ks)) {
    max_ks <- c(3, 2)
  }

  # Decreasing order, so the most permissive configuration of a fold runs first.
  alphas <- sort(alphas, decreasing = TRUE)
  max_ks <- sort(max_ks, decreasing = TRUE)

  # defining the SES configurations (every alpha combined with every max_k)
  nSESConfs <- length(alphas) * length(max_ks)
  if (nSESConfs == 0) {
    stop("At least one value is required in both 'alphas' and 'max_ks'.")
  }
  SES_configurations <- vector("list", nSESConfs)
  conf_id <- 0
  for (a in alphas) {
    for (k in max_ks) {
      conf_id <- conf_id + 1
      SES_configurations[[conf_id]] <- list(id = conf_id, a = a, max_k = k)
    }
  }

  if (is.null(folds)) {
    # The folds are built from the full target; no per-fold training target
    # exists yet at this point.
    folds <- generateCVRuns(target, ntimes = 1, nfold = kfolds, leaveOneOut = FALSE, stratified = TRUE)
  } else {
    kfolds <- length(folds[[1]])
  }

  # Task specific defaults; each one is used only when the caller did not
  # supply the corresponding argument.
  task_defaults <- switch(task,
    C = list(metric = auc.mxm, modeler = glm.mxm, test = "testIndLogistic"), # classification (logistic regression)
    R = list(metric = mse.mxm, modeler = lm.mxm, test = "testIndFisher"),    # regression (linear regression)
    S = list(metric = ci.mxm, modeler = coxph.mxm, test = "censIndLR")       # survival analysis (cox regression)
  )
  metricFunction <- if (is.null(metric)) task_defaults$metric else metric
  modelerFunction <- if (is.null(modeler)) task_defaults$modeler else modeler
  test <- if (is.null(ses_test)) task_defaults$test else ses_test

  # general cv results list: one entry per configuration, one slot per fold
  conf_ses <- lapply(SES_configurations, function(conf) {
    list(
      configuration = conf,
      preds = vector("list", kfolds),
      performances = rep(NA_real_, kfolds),
      signatures = vector("list", kfolds)
    )
  })

  for (k in seq_len(kfolds)) {
    # leave one fold out each time as a test set and the rest as train set
    test_samples <- folds[[1]][[k]]
    train_samples <- unlist(folds[[1]][-k], use.names = FALSE)

    # drop = FALSE keeps the two-dimensional shape even for a single row or column
    train_set <- dataset[train_samples, , drop = FALSE]
    train_target <- target[train_samples]
    test_set <- dataset[test_samples, , drop = FALSE]
    test_target <- target[test_samples]

    # SES hashmap, shared by all configurations of this fold
    SESHashMap <- NULL

    for (ses_conf_id in seq_len(nSESConfs)) {

      threshold <- SES_configurations[[ses_conf_id]]$a
      max_k <- SES_configurations[[ses_conf_id]]$max_k

      # `hash = TRUE` must be a named argument: written as `hash <- TRUE` it is
      # an assignment whose value is passed positionally to some other formal.
      results <- MXM::SES(train_target, train_set, max_k = max_k, threshold = threshold, test = test, hash = TRUE, hashObject = SESHashMap)

      SESHashMap <- results@hashObject
      signatures <- results@signatures

      # `[k] <- list(...)` stores the value even when it is NULL, whereas
      # `[[k]] <- NULL` would delete the element and shift the later folds.
      conf_ses[[ses_conf_id]]$signatures[k] <- list(signatures)

      preds <- NULL
      performance <- NA_real_

      has_signature <- !is.null(dim(signatures)) && nrow(signatures) >= 1 &&
        length(results@selectedVars) > 0

      if (has_signature) {
        # data of the reference signature (i.e. the first row of selected variables)
        curr_sign <- as.vector(signatures[1, ])
        sign_data <- as.matrix(train_set[, curr_sign, drop = FALSE])
        sign_test <- as.matrix(test_set[, curr_sign, drop = FALSE])

        # model for the task; a NULL result means no predictions for this fold
        preds <- modelerFunction(train_target, sign_data, sign_test)
        if (!is.null(preds)) {
          performance <- metricFunction(preds, test_target)
        }
      }

      conf_ses[[ses_conf_id]]$preds[k] <- list(preds)
      conf_ses[[ses_conf_id]]$performances[k] <- performance
    }

    # clear the hashmap and garbages
    if (!is.null(SESHashMap$pvalue_hash)) {
      hash::clear(SESHashMap$pvalue_hash)
    }
    if (!is.null(SESHashMap$stat_hash)) {
      hash::clear(SESHashMap$stat_hash)
    }
    rm(SESHashMap)
    gc()
  }

  # configurations x folds matrix of performances
  perf_matrix <- matrix(NA_real_, nrow = nSESConfs, ncol = kfolds)
  for (i in seq_len(nSESConfs)) {
    perf_matrix[i, ] <- as.vector(conf_ses[[i]]$performances)
  }

  # finding the best (highest) average performance; failed folds are ignored
  avg_perf <- rowMeans(perf_matrix, na.rm = TRUE)
  if (all(is.na(avg_perf))) {
    warning("No configuration produced a performance value in any fold.", call. = FALSE)
    index <- 1
  } else {
    # which.max skips missing averages and keeps the first configuration on ties
    index <- which.max(avg_perf)
  }
  best_perf <- avg_perf[index]

  # Optimism of reporting the selected maximum: average, over the folds, of the
  # gap between the best score reached in the fold and the score of the selected
  # configuration in that fold (in the spirit of the Tibshirani & Tibshirani
  # bias correction, written for a metric that is maximised).
  fold_best <- apply(perf_matrix, 2, function(fold_perf) {
    if (all(is.na(fold_perf))) NA_real_ else max(fold_perf, na.rm = TRUE)
  })
  estb <- mean(fold_best - perf_matrix[index, ], na.rm = TRUE)

  # recording the best results
  best_model <- list(
    cv_results_all = conf_ses,
    best_performance = best_perf,
    best_configuration = conf_ses[[index]]$configuration,
    BC_best_perf = best_perf - estb
  )

  return(best_model)

}

#metric functions
#input
#predictions
#test_target

#output
#the metric value (numeric)


#metric functions (use of ROCR package)
# AUC metric: builds a prediction object from the predictions and the true
# targets, asks for the 'auc' performance measure and returns its first
# y value. prediction() and performance() come from the ROCR package.
auc.mxm <- function(predictions, test_target){
  rocr_input <- prediction(predictions, test_target)
  auc_perf <- performance(rocr_input, measure = 'auc')
  auc_perf@y.values[[1]]
}

# Accuracy metric: a prediction strictly above 0.5 counts as the positive
# class; the result is the proportion of predictions that equal test_target.
acc.mxm <- function(predictions, test_target){
  predicted_positive <- predictions > 0.5
  is_hit <- predicted_positive == test_target
  mean(is_hit)
}

# Mean squared error metric. Lower mse means better performance, so the
# value is returned with a flipped sign: higher (closer to zero) is better.
mse.mxm <- function(predictions, test_target){
  squared_errors <- (predictions - test_target)^2
  -mean(squared_errors)
}

# Concordance index metric (Hmisc package required for rcorr.cens).
# Returns one minus the first element of the rcorr.cens() result.
ci.mxm <- function(predictions, test_target){
  concordance <- rcorr.cens(predictions, test_target)[1]
  1 - concordance
}

# Poisson deviance based metric: -2 * sum( y * log(y / prediction) ).
# Lower deviance means better performance, so the sign is flipped in order to
# have higher values for better performances.
#
# predictions : predicted means
# test_target : observed counts
#
# A zero count gives 0 * log(0) = NaN; such terms are dropped with
# na.rm = TRUE (the same convention nbdev.mxm uses), otherwise a single zero
# in test_target would turn the whole metric into NaN.
poisdev.mxm <- function(predictions, test_target) {
  - 2 * sum( test_target * log(test_target / predictions), na.rm = TRUE )
}

# Negative binomial deviance based metric. Lower deviance means better
# performance, so the sign is flipped in order to have higher values for
# better performances.
#
# predictions : predicted means
# test_target : observed counts
# theta       : parameter added to both the counts and the predicted means in
#               the second term
#
# The first sum drops the NaN terms produced by zero counts (0 * log(0)).
# na.rm is spelled TRUE because T is an ordinary variable that can be reassigned.
nbdev.mxm <- function(predictions, test_target, theta) {
  count_term <- 2 * sum( test_target * log(test_target / predictions), na.rm = TRUE )
  theta_term <- 2 * sum( (test_target + theta) * log( (test_target + theta) / (predictions + theta) ) )
  - (count_term - theta_term)
}


#Modeling Functions

#input
#train_target
#sign_data
#sign_test

#output
#preds

glm.mxm <- function(train_target, sign_data, sign_test){
  # Logistic regression modeler: fits a binomial glm of train_target on the
  # signature variables and returns hard 0/1 predictions for sign_test
  # (fitted probability >= 0.5 becomes 1, below 0.5 becomes 0).

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  logit_fit <- glm( train_target ~ ., data = data.frame(x), family = binomial() )

  x <- sign_test
  labels <- predict( logit_fit, newdata = data.frame(x), type = 'response' )

  # positions are taken before any value is overwritten; missing
  # probabilities are left untouched
  at_least_half <- which(labels >= 0.5)
  below_half <- which(labels < 0.5)
  labels[at_least_half] <- 1
  labels[below_half] <- 0
  labels
}

pois.mxm <- function(train_target, sign_data, sign_test){
  # Poisson regression modeler: fits a poisson glm of train_target on the
  # signature variables and returns the response-scale predictions for sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  pois_fit <- glm( train_target ~ ., data = data.frame(x), family = poisson() )

  x <- sign_test
  predict( pois_fit, newdata = data.frame(x), type = 'response' )
}


nb.mxm <- function(train_target, sign_data, sign_test){
  # Negative binomial regression modeler: fits glm.nb of train_target on the
  # signature variables and returns the response-scale predictions for sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  nb_fit <- glm.nb( train_target ~ ., data = data.frame(x) )

  x <- sign_test
  predict( nb_fit, newdata = data.frame(x), type = 'response' )
}

multinom.mxm <- function(train_target, sign_data, sign_test){
  # Multinomial regression modeler: fits multinom (with tracing switched off)
  # of train_target on the signature variables and returns predict()'s default
  # output for sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  multinom_fit <- multinom( train_target ~ ., data = data.frame(x), trace = FALSE )

  x <- sign_test
  predict( multinom_fit, newdata = data.frame(x) )
}

ordinal.mxm <- function(train_target, sign_data, sign_test){
  # Ordinal regression modeler: fits clm of train_target on the signature
  # variables and returns, for every row of sign_test, the index of the class
  # with the highest fitted probability (an integer vector).

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  sign_model <- clm( train_target ~ ., data = data.frame(x), trace = FALSE )

  x <- sign_test
  # The fitted values live in the `fit` element of the prediction list. The
  # element must be named exactly: `$fits` matches nothing and yields NULL.
  # NOTE(review): assumes `fit` is a rows x classes probability matrix because
  # newdata carries no response column - confirm against predict.clm.
  class_probs <- predict( sign_model, newdata = data.frame(x) )$fit

  # return the class indices, not the probability matrix they are derived from
  as.vector( apply(class_probs, 1, which.max) )
}

lm.mxm <- function(train_target, sign_data, sign_test){
  # Linear regression modeler: fits lm of train_target on the signature
  # variables and returns the predictions for sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  linear_fit <- lm( train_target ~ ., data = data.frame(x) )

  x <- sign_test
  predict( linear_fit, newdata = data.frame(x) )
}

rq.mxm <- function(train_target, sign_data, sign_test){
  # Quantile regression modeler: fits rq (called with its default settings) of
  # train_target on the signature variables and returns the predictions for
  # sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  quantile_fit <- rq( train_target ~ ., data = data.frame(x) )

  x <- sign_test
  predict( quantile_fit, newdata = data.frame(x) )
}

rlm.mxm <- function(train_target, sign_data, sign_test){
  # Robust regression modeler: fits rlm (called with its default settings) of
  # train_target on the signature variables and returns the predictions for
  # sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  robust_fit <- rlm( train_target ~ ., data = data.frame(x) )

  x <- sign_test
  predict( robust_fit, newdata = data.frame(x) )
}

beta.mxm <- function(train_target, sign_data, sign_test){
  # Beta regression modeler: fits betareg of train_target on the signature
  # variables, predicts for sign_test and returns the predictions on the logit
  # scale.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  beta_fit <- betareg( train_target ~ ., data = data.frame(x) )

  x <- sign_test
  fitted_props <- predict( beta_fit, newdata = data.frame(x) )

  # logit transformation to make it comparable with the normal regression
  odds <- fitted_props / (1 - fitted_props)
  log(odds)
}


coxph.mxm <- function(train_target, sign_data, sign_test){
  # Cox regression modeler: fits coxph of train_target on the signature
  # variables and returns the "risk" type predictions for sign_test.

  # Both data frames are built from a variable called x on purpose: this
  # overcomes the column naming problems when the signature has just one variable.
  x <- sign_data
  cox_fit <- coxph(train_target ~ ., data = data.frame(x))

  x <- sign_test
  predict(cox_fit, newdata = data.frame(x), type = "risk")
}