#' Simulate a single replicate of NR-seq data
#'
#' In `SimulateOneRep`, users have the option to either provide vectors of feature-specific
#' read counts, fraction news, kdegs, and ksyns for the simulation, or to have those drawn
#' from relevant distributions whose properties can be tuned by the various optional
#' parameters of `SimulateOneRep`. The number of mutable nucleotides (nT) in
#' a read is drawn from a binomial distribution with `readlength` trials and a probability
#' of "success" equal to `Ucont`. A read's status as new or old is drawn from a Bernoulli
#' distribution with probability of "success" equal to the feature's fraction new. If a read
#' is new, the number of mutations in the read is drawn from a binomial distribution with
#' probability of mutation equal to pnew. If a read is old, the number of mutations is instead
#' drawn from a binomial distribution with probability of mutation equal to pold.
#'
#' @param nfeatures Number of "features" (e.g., genes) to simulate data for
#' @param read_vect Vector of length = `nfeatures`; specifies the number of reads
#' to be simulated for each feature. If this is not provided, the number of reads
#' simulated is equal to `round(seqdepth * (ksyn_i/kdeg_i)/sum(ksyn/kdeg))`. In other words,
#' the normalized steady-state abundance of a feature is multiplied by the total number
#' of reads to be simulated and rounded to the nearest integer.
#' @param label_time Length of s^4^U feed to simulate.
#' @param sample_name Character vector to assign to `sample` column of output simulated
#' data table (the cB table).
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param fn_vect Vector of length = `nfeatures`; specifies the fraction new to use for each
#' feature's simulation. If this is not provided and `kdeg_vect` is, then `fn_vect = 1 - exp(-kdeg_vect*label_time)`.
#' If both `fn_vect` and `kdeg_vect` are not provided, then kdegs are simulated from a joint distribution as
#' described below and converted to a `fn_vect` as when `kdeg_vect` is user-provided.
#' @param kdeg_vect Vector of length = `nfeatures`; specifies the degradation rate constant to use for each
#' feature's simulation. If this is not provided and `fn_vect` is, then `kdeg_vect = -log(1 - fn_vect)/label_time`.
#' If both `kdeg_vect` and `fn_vect` are not provided, each feature's `kdeg_vect` value is drawn from a log-normal distribution
#' with meanlog = `logkdeg_mean` and sdlog = `logkdeg_sd`. `kdeg_vect` is actually only simulated in the case
#' where `read_vect` is also not provided, as it will be used to simulate read counts as described above.
#' @param ksyn_vect Vector of length = `nfeatures`; specifies the synthesis rate constant to use for each
#' feature's simulation. If this is not provided, and `read_vect` is also not provided, then each
#' feature's `ksyn_vect` value is drawn from a log-normal distribution with meanlog = `logksyn_mean` and
#' sdlog = `logksyn_sd`. ksyn's do not need to be simulated if `read_vect` is provided, as they only
#' influence read counts.
#' @param pnew Probability that a T is mutated to a C if a read is new.
#' @param pold Probability that a T is mutated to a C if a read is old.
#' @param logkdeg_mean If necessary, meanlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logkdeg_sd If necessary, sdlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logksyn_mean If necessary, meanlog of a log-normal distribution from which
#' ksyns are simulated
#' @param logksyn_sd If necessary, sdlog of a log-normal distribution from which
#' ksyns are simulated
#' @param seqdepth Only relevant if `read_vect` is not provided; in that case, this is
#' the total number of reads to simulate.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param Ucont_alpha Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape1 = `Ucont_alpha`.
#' @param Ucont_beta Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape2 = `Ucont_beta`.
#' @param feature_pnew Boolean; if TRUE, simulate a different pnew for each feature
#' @param pnew_kdeg_corr Boolean; only relevant if `feature_pnew` is TRUE. If so, then
#' setting `pnew_kdeg_corr` to TRUE will ensure that higher kdeg transcripts have a higher
#' pnew.
#' @param logit_pnew_mean If `feature_pnew` is TRUE, then the logit(pnew) for each feature
#' will be drawn from a normal distribution with this mean.
#' @param logit_pnew_sd If `feature_pnew` is TRUE, then the logit(pnew) for each feature
#' will be drawn from a normal distribution with this standard deviation.
#' @import data.table
#' @importFrom magrittr %>%
#' @return List with two elements:
#' \itemize{
#'  \item cB: Tibble that can be passed as the `cB` arg to `EZbakRData()`.
#'  \item ground_truth: Tibble containing simulated ground truth.
#' }
#' @examples
#' simdata <- SimulateOneRep(30)
#' @export
SimulateOneRep <- function(nfeatures, read_vect = NULL, label_time = 2,
                           sample_name = "sampleA",
                           feature_prefix = "Gene",
                           fn_vect = NULL, kdeg_vect = NULL, ksyn_vect = NULL,
                           pnew = 0.05, pold = 0.002,
                           logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                           logksyn_mean = 2.3, logksyn_sd = 0.7,
                           seqdepth = nfeatures*2500, readlength = 200,
                           Ucont_alpha = 25, Ucont_beta = 75, feature_pnew = FALSE,
                           pnew_kdeg_corr = FALSE,
                           logit_pnew_mean = -2.5, logit_pnew_sd = 0.1){

  ### Hack to deal with devtools::check() NOTEs
  feature <- TC <- nT <- NULL

  # Lets `.()` be used as an alias for `list()` in the data.table call below
  `.` <- list


  ### Check validity of input

  # Snapshot of everything currently defined in the function environment
  # (arguments plus the placeholders above), handed to the validator.
  args <- c(as.list(environment()))

  check_SimulateOneRep_input(args)


  ### Simulate feature-specific pnew as necessary

  if(feature_pnew){

    # One pnew per feature, drawn on the logit scale
    pnew <- inv_logit(stats::rnorm(nfeatures,
                                   logit_pnew_mean,
                                   logit_pnew_sd))

  }


  ### Simulate kinetic parameters as necessary

  # kdeg and fraction new

  if(is.null(fn_vect)){

    if(is.null(kdeg_vect)){

      kdeg_vect <- stats::rlnorm(nfeatures,
                                 logkdeg_mean,
                                 logkdeg_sd)

    }

    fn_vect <- 1 - exp(-kdeg_vect*label_time)

  }

  # read counts

  if(is.null(read_vect)){

    if(is.null(ksyn_vect)){

      ksyn_vect <- stats::rlnorm(nfeatures,
                                 logksyn_mean,
                                 logksyn_sd)

    }

    if(is.null(kdeg_vect)){

      # fn_vect was user-provided; back out the implied kdeg
      kdeg_vect <- -log(1 - fn_vect)/label_time

    }

    # Normalized steady-state abundance (ksyn/kdeg) scaled to the sequencing depth
    abundance <- ksyn_vect/kdeg_vect
    read_vect <- round((abundance/sum(abundance))*seqdepth)

  }

  # A single read count is shared by all features.
  # `&&` because this is a scalar condition.
  if(length(read_vect) == 1 && nfeatures > 1){

    read_vect <- rep(read_vect, times = nfeatures)

  }


  ### Resolve the per-feature pnew actually used for simulation

  if(feature_pnew && pnew_kdeg_corr){

    # Give the k-th smallest pnew to the feature with the k-th smallest kdeg.
    # kdeg_vect is still NULL when fn_vect and read_vect were both supplied;
    # in that case rank on the kdeg implied by fn_vect so the ranking is defined.
    rank_basis <- kdeg_vect
    if(is.null(rank_basis)){

      rank_basis <- -log(1 - fn_vect)/label_time

    }

    pnew <- pnew[order(pnew)][rank(rank_basis)]

  }


  ### Simulate mutational data

  totreads <- sum(read_vect)

  # 1 = new read, 0 = old read
  read_status <- stats::rbinom(n = totreads,
                               size = 1,
                               prob = rep(fn_vect, times = read_vect))

  # Simulate feature-specific U-contents
  Ucont <- stats::rbeta(
    n = nfeatures,
    shape1 = Ucont_alpha,
    shape2 = Ucont_beta
  )

  # Number of mutable nucleotides in each read
  nT_count <- stats::rbinom(n = totreads,
                            size = readlength,
                            prob = rep(Ucont,
                                       times = read_vect))

  # Expand feature-level pnew to read level; a global pnew is used as is
  if(feature_pnew){

    pnew_reads <- rep(pnew, times = read_vect)

  }else{

    pnew_reads <- pnew

  }

  # New reads mutate at pnew, old reads at pold
  TC_count <- stats::rbinom(n = totreads,
                            size = nT_count,
                            prob = read_status*pnew_reads + (1 - read_status)*pold)


  # Collapse identical reads into counts (the cB format)
  cB <- data.table::data.table(
    sample = sample_name,
    feature = rep(paste0(feature_prefix, seq_len(nfeatures)),
                  times = read_vect),
    TC = TC_count,
    nT = nT_count
  )[,.(n = .N), by = .(sample, feature, TC, nT)]


  ### Save ground truth

  feature_names <- paste0(feature_prefix, seq_len(nfeatures))

  if(feature_pnew){

    truth <- data.table::data.table(sample = sample_name,
                                    feature = feature_names,
                                    true_fraction_highTC = fn_vect,
                                    true_kdeg = kdeg_vect,
                                    true_ksyn = ksyn_vect,
                                    true_pnew = pnew)

  }else{

    truth <- data.table::data.table(sample = sample_name,
                                    feature = feature_names,
                                    true_fraction_highTC = fn_vect,
                                    true_kdeg = kdeg_vect,
                                    true_ksyn = ksyn_vect)

  }


  return(list(cB = cB,
              ground_truth = truth))

}


# Dirichlet distribution RNG
# Source: LaplacesDemon package (https://github.com/LaplacesDemonR/LaplacesDemon/blob/de9107d46c215a9db57ad6e9c95a9ebcaf75ef25/R/distributions.R#L327)
#
# Draws `n` Dirichlet vectors by normalizing independent gamma draws.
# `alpha` is a vector of concentration parameters (or a matrix with one row
# per draw); the result is a matrix with one row per draw.
rdirichlet <- function(n, alpha) {

  # Treat a plain vector as a single row of concentration parameters
  alpha <- rbind(alpha)
  n_param_rows <- nrow(alpha)
  n_components <- ncol(alpha)

  # Fewer parameter rows than requested draws: fill an n-row matrix row-wise
  if (n > n_param_rows) {
    alpha <- matrix(alpha, nrow = n, ncol = n_components, byrow = TRUE)
  }

  # Gamma(shape = alpha, rate = 1) draws, one column per component
  gamma_draws <- matrix(
    stats::rgamma(n_components * n, alpha),
    ncol = n_components
  )

  # Row totals via a matrix product with a vector of ones
  row_totals <- gamma_draws %*% rep(1, n_components)

  gamma_draws / as.vector(row_totals)
}

#' Vectorized simulation of one replicate of multi-label NR-seq data
#'
#' Generalizes SimulateOneRep() to simulate any combination of mutation types.
#' Currently, no kinetic model is used to relate certain parameters to the
#' fractions of reads belonging to each simulated mutational population. Instead
#' these fractions are drawn from a Dirichlet distribution with gene-specific
#' parameters.
#'
#' @param nfeatures Number of "features" (e.g., genes) to simulate data for
#' @param populations Vector of mutation populations you want to simulate.
#' @param fraction_design Fraction design matrix, specifying which potential mutational
#' populations should actually exist. See ?EstimateFractions for more details.
#' @param fractions_matrix Matrix of fractions of each mutational population to simulate.
#' If not provided, this will be simulated. One row for each feature, one column for each
#' mutational population, rows should sum to 1.
#' @param read_vect Vector of length = `nfeatures`; specifies the number of reads
#' to be simulated for each feature. If this is not provided, the number of reads
#' simulated is equal to `round(seqdepth * (ksyn_i/kdeg_i)/sum(ksyn/kdeg))`. In other words,
#' the normalized steady-state abundance of a feature is multiplied by the total number
#' of reads to be simulated and rounded to the nearest integer.
#' @param sample_name Character vector to assign to `sample` column of output simulated
#' data table (the cB table).
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param kdeg_vect Vector of length = `nfeatures`; specifies the degradation rate constant to use for each
#' feature's simulation. Only used when `read_vect` is not provided, in which case it determines the
#' simulated read counts as described above. If neither `kdeg_vect` nor `read_vect` is provided, each
#' feature's `kdeg_vect` value is drawn from a log-normal distribution
#' with meanlog = `logkdeg_mean` and sdlog = `logkdeg_sd`.
#' @param ksyn_vect Vector of length = `nfeatures`; specifies the synthesis rate constant to use for each
#' feature's simulation. If this is not provided, and `read_vect` is also not provided, then each
#' feature's `ksyn_vect` value is drawn from a log-normal distribution with meanlog = `logksyn_mean` and
#' sdlog = `logksyn_sd`. ksyn's do not need to be simulated if `read_vect` is provided, as they only
#' influence read counts.
#' @param phighs Vector of probabilities of mutation rates in labeled reads of each type denoted in
#' `populations`. Should be a named vector, with names being the corresponding `population`.
#' @param plows Vector of probabilities of mutation rates in unlabeled reads of each type denoted in
#' `populations`. Should be a named vector, with names being the corresponding `population`.
#' @param logkdeg_mean If necessary, meanlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logkdeg_sd If necessary, sdlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logksyn_mean If necessary, meanlog of a log-normal distribution from which
#' ksyns are simulated
#' @param logksyn_sd If necessary, sdlog of a log-normal distribution from which
#' ksyns are simulated
#' @param seqdepth Only relevant if `read_vect` is not provided; in that case, this is
#' the total number of reads to simulate.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param alpha_min Minimum possible value of alpha element of Dirichlet random variable
#' @param alpha_max Maximum possible value of alpha element of Dirichlet random variable
#' @param Ucont Probability that a nucleotide in a simulated read is a U.
#' @param Acont Probability that a nucleotide in a simulated read is an A.
#' @param Gcont Probability that a nucleotide in a simulated read is a G.
#' @param Ccont Probability that a nucleotide in a simulated read is a C.
#' @importFrom magrittr %>%
#' @return List with two elements:
#' \itemize{
#'  \item cB: Tibble that can be passed as the `cB` arg to `EZbakRData()`.
#'  \item ground_truth: Tibble containing simulated ground truth.
#' }
#' @examples
#' simdata <- VectSimulateMultiLabel(30)
#' @export
VectSimulateMultiLabel <- function(nfeatures, populations = c("TC"),
                                   fraction_design = create_fraction_design(populations),
                                   fractions_matrix = NULL, read_vect = NULL,
                                   sample_name = "sampleA", feature_prefix = "Gene",
                                   kdeg_vect = NULL, ksyn_vect = NULL,
                                   logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                                   logksyn_mean = 2.3, logksyn_sd = 0.7,
                                   phighs = stats::setNames(rep(0.05, times = length(populations)), populations),
                                   plows = stats::setNames(rep(0.002, times = length(populations)), populations),
                                   seqdepth = nfeatures*2500, readlength = 200,
                                   alpha_min = 3, alpha_max = 6,
                                   Ucont = 0.25, Acont = 0.25, Gcont = 0.25, Ccont = 0.25){


  # Hack to deal with devtools::check() NOTEs
  present <- NULL

  # Number of mutational populations flagged as present in the design
  npops <- sum(fraction_design$present)

  # Infer fractions of each mutation type for each feature
  if(is.null(fractions_matrix)){

    # Feature-specific Dirichlet alphas, bounded by alpha_min/alpha_max
    alphas <- matrix(stats::runif(npops*nfeatures, min = alpha_min, max = alpha_max),
                     nrow = nfeatures, ncol = npops,
                     byrow = TRUE)

    # Feature-specific fractions of each mutational population
    fractions_matrix <- matrix(0,
                               nrow = nfeatures,
                               ncol = npops)
    for(f in seq_len(nfeatures)){

      fractions_matrix[f,] <- rdirichlet(1, alphas[f,])

    }

  }

  # Infer read counts
  if(is.null(read_vect)){

    if(is.null(ksyn_vect)){

      ksyn_vect <- stats::rlnorm(nfeatures,
                                 logksyn_mean,
                                 logksyn_sd)

    }

    if(is.null(kdeg_vect)){

      kdeg_vect <- stats::rlnorm(nfeatures,
                                 logkdeg_mean,
                                 logkdeg_sd)

    }

    # Normalized steady-state abundance (ksyn/kdeg) scaled to the sequencing depth
    abundance <- ksyn_vect/kdeg_vect
    read_vect <- round((abundance/sum(abundance))*seqdepth)

  }


  ### Need to figure out which nucleotide types to simulate

  # First letter of each population is the mutable nucleotide (e.g., "TC" -> "nT")
  nuc_cnt_names <- paste0("n", substr(populations, start = 1, stop = 1))

  # Each nucleotide is simulated once, in order of first appearance in `populations`
  sim_nucs <- unique(nuc_cnt_names)

  # What is vector of nucleotide content probabilities
  nuc_probs <- c(Ucont, Acont, Gcont, Ccont)
  names(nuc_probs) <- c("nT", "nA", "nG", "nC")

  unknown_nucs <- setdiff(sim_nucs, names(nuc_probs))
  if(length(unknown_nucs) > 0){

    stop("Each element of `populations` must start with T, A, G, or C; cannot simulate counts for: ",
         paste(unknown_nucs, collapse = ", "),
         call. = FALSE)

  }

  # Select by name so that row i of the simulated count matrix is sim_nucs[i];
  # the final element lumps together all nucleotides not being tracked.
  nuc_probs_sim <- nuc_probs[sim_nucs]
  nuc_probs_sim <- c(nuc_probs_sim,
                     1 - sum(nuc_probs_sim))

  # Row of the count matrix holding the mutable nucleotide of each population
  nuc_rows <- match(nuc_cnt_names, sim_nucs)


  ### Simulate read status in each population

  fraction_design_present <- fraction_design %>%
    dplyr::filter(present)

  # Reads per feature for each mutational population (one list element per population)
  fraction_reads <- lapply(seq_len(ncol(fractions_matrix)), function(i){
    ceiling(read_vect*fractions_matrix[,i])
  })

  tot_reads <- sum(unlist(fraction_reads))
  reads_per_fraction <- unname(vapply(fraction_reads, sum, numeric(1)))


  # Simulate nucleotide counts; one column per read, ordered population by
  # population and, within a population, feature by feature
  nt_cnts <- stats::rmultinom(tot_reads, size = readlength,
                              prob = nuc_probs_sim)

  ### Simulate mutations

  mutations <- vector(mode = "list",
                      length = length(populations))
  names(mutations) <- populations

  for(p in seq_along(populations)){

    pop <- populations[p]

    # Mutation rate of this type in each mutational population: high if the
    # design marks the type as labeled in that population, low otherwise
    is_high <- as.logical(fraction_design_present[[pop]])
    pvect <- ifelse(is_high, phighs[[pop]], plows[[pop]])

    mutations[[pop]] <- stats::rbinom(tot_reads,
                                      size = nt_cnts[nuc_rows[p],],
                                      prob = rep(pvect,
                                                 times = reads_per_fraction))

  }

  # Nucleotide count columns, one per distinct mutable nucleotide
  nucleotides <- lapply(seq_along(sim_nucs), function(i) nt_cnts[i,])
  names(nucleotides) <- sim_nucs

  ### Figure out feature assignment

  # Same read ordering as above: all features of population 1, then population 2, ...
  feature_vect <- unlist(lapply(fraction_reads, function(cnts){
    rep(seq_len(nfeatures), times = cnts)
  }))
  feature_vect <- paste0(feature_prefix, feature_vect)


  cB <- dplyr::bind_cols(list(
    dplyr::tibble(feature = feature_vect),
    dplyr::as_tibble(mutations),
    dplyr::as_tibble(nucleotides))) %>%
    dplyr::mutate(sample = sample_name)


  ### Name the ground truth fraction columns

  # e.g., "true_fraction_highTC_lowGA": one _high/_low tag per design column
  design_cols <- setdiff(colnames(fraction_design_present), "present")
  nvect <- vapply(seq_len(nrow(fraction_design_present)), function(r){

    is_high <- vapply(design_cols,
                      function(cn) as.logical(fraction_design_present[[cn]][r]),
                      logical(1))

    paste0("true_fraction",
           paste0(ifelse(is_high, "_high", "_low"), design_cols, collapse = ""))

  }, character(1))

  colnames(fractions_matrix) <- nvect
  ground_truth <- dplyr::as_tibble(fractions_matrix)
  ground_truth$feature <- paste0(feature_prefix, seq_len(nrow(ground_truth)))

  # Collapse identical reads into counts (column `n`)
  cB <- cB %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(colnames(cB)))) %>%
    dplyr::count()

  return(list(cB = cB,
              ground_truth = ground_truth))


}



#' Simulate one replicate of multi-label NR-seq data
#'
#' Generalizes SimulateOneRep() to simulate any combination of mutation types.
#' Currently, no kinetic model is used to relate certain parameters to the
#' fractions of reads belonging to each simulated mutational population. Instead
#' these fractions are drawn from a Dirichlet distribution with gene-specific
#' parameters.
#'
#' @param nfeatures Number of "features" (e.g., genes) to simulate data for
#' @param populations Vector of mutation populations you want to simulate.
#' @param fraction_design Fraction design matrix, specifying which potential mutational
#' populations should actually exist. See ?EstimateFractions for more details.
#' @param fractions_matrix Matrix of fractions of each mutational population to simulate.
#' If not provided, this will be simulated. One row for each feature, one column for each
#' mutational population, rows should sum to 1.
#' @param read_vect Vector of length = `nfeatures`; specifies the number of reads
#' to be simulated for each feature. If this is not provided, the number of reads
#' simulated is equal to `round(seqdepth * (ksyn_i/kdeg_i)/sum(ksyn/kdeg))`. In other words,
#' the normalized steady-state abundance of a feature is multiplied by the total number
#' of reads to be simulated and rounded to the nearest integer.
#' @param sample_name Character vector to assign to `sample` column of output simulated
#' data table (the cB table).
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param kdeg_vect Vector of length = `nfeatures`; specifies the degradation rate constant to use for each
#' feature's simulation. Only used when `read_vect` is not provided, in which case it determines the
#' simulated read counts as described above. If neither `kdeg_vect` nor `read_vect` is provided, each
#' feature's `kdeg_vect` value is drawn from a log-normal distribution
#' with meanlog = `logkdeg_mean` and sdlog = `logkdeg_sd`.
#' @param ksyn_vect Vector of length = `nfeatures`; specifies the synthesis rate constant to use for each
#' feature's simulation. If this is not provided, and `read_vect` is also not provided, then each
#' feature's `ksyn_vect` value is drawn from a log-normal distribution with meanlog = `logksyn_mean` and
#' sdlog = `logksyn_sd`. ksyn's do not need to be simulated if `read_vect` is provided, as they only
#' influence read counts.
#' @param phighs Vector of probabilities of mutation rates in labeled reads of each type denoted in
#' `populations`. Should be a named vector, with names being the corresponding `population`.
#' @param plows Vector of probabilities of mutation rates in unlabeled reads of each type denoted in
#' `populations`. Should be a named vector, with names being the corresponding `population`.
#' @param logkdeg_mean If necessary, meanlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logkdeg_sd If necessary, sdlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logksyn_mean If necessary, meanlog of a log-normal distribution from which
#' ksyns are simulated
#' @param logksyn_sd If necessary, sdlog of a log-normal distribution from which
#' ksyns are simulated
#' @param seqdepth Only relevant if `read_vect` is not provided; in that case, this is
#' the total number of reads to simulate.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param alpha_min Minimum possible value of alpha element of Dirichlet random variable
#' @param alpha_max Maximum possible value of alpha element of Dirichlet random variable
#' @param Ucont Probability that a nucleotide in a simulated read is a U.
#' @param Acont Probability that a nucleotide in a simulated read is an A.
#' @param Gcont Probability that a nucleotide in a simulated read is a G.
#' @param Ccont Probability that a nucleotide in a simulated read is a C.
#' @importFrom magrittr %>%
#' @return List with two elements:
#' \itemize{
#'  \item cB: Tibble that can be passed as the `cB` arg to `EZbakRData()`.
#'  \item ground_truth: Tibble containing simulated ground truth.
#' }
#' @examples
#' simdata <- SimulateMultiLabel(3)
#' @export
SimulateMultiLabel <- function(nfeatures, populations = c("TC"),
                               fraction_design = create_fraction_design(populations),
                               fractions_matrix = NULL, read_vect = NULL,
                               sample_name = "sampleA", feature_prefix = "Gene",
                               kdeg_vect = NULL, ksyn_vect = NULL,
                               logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                               logksyn_mean = 2.3, logksyn_sd = 0.7,
                               phighs = stats::setNames(rep(0.05, times = length(populations)), populations),
                               plows = stats::setNames(rep(0.002, times = length(populations)), populations),
                               seqdepth = nfeatures*2500, readlength = 200,
                               alpha_min = 3, alpha_max = 6,
                               Ucont = 0.25, Acont = 0.25, Gcont = 0.25, Ccont = 0.25){


  # Hacks to deal with devtools::check()
  present <- NULL

  # Number of mutational populations flagged as present in the design
  npops <- sum(fraction_design$present)

  # Infer fractions of each mutation type for each feature
  if(is.null(fractions_matrix)){

    # Feature-specific Dirichlet alphas, bounded by alpha_min/alpha_max
    alphas <- matrix(stats::runif(npops*nfeatures, min = alpha_min, max = alpha_max),
                     nrow = nfeatures, ncol = npops,
                     byrow = TRUE)

    # Feature-specific fractions of each mutational population
    fractions_matrix <- matrix(0,
                               nrow = nfeatures,
                               ncol = npops)
    for(f in seq_len(nfeatures)){

      fractions_matrix[f,] <- rdirichlet(1, alphas[f,])

    }

  }

  # Infer read counts
  if(is.null(read_vect)){

    if(is.null(ksyn_vect)){

      ksyn_vect <- stats::rlnorm(nfeatures,
                                 logksyn_mean,
                                 logksyn_sd)

    }

    if(is.null(kdeg_vect)){

      kdeg_vect <- stats::rlnorm(nfeatures,
                                 logkdeg_mean,
                                 logkdeg_sd)

    }

    # Normalized steady-state abundance (ksyn/kdeg) scaled to the sequencing depth
    abundance <- ksyn_vect/kdeg_vect
    read_vect <- round((abundance/sum(abundance))*seqdepth)

  }

  # A single read count is shared by all features (read_vect is indexed per
  # feature below)
  if(length(read_vect) == 1 && nfeatures > 1){

    read_vect <- rep(read_vect, times = nfeatures)

  }


  ### Need to figure out which nucleotide types to simulate

  # First letter of each population is the mutable nucleotide (e.g., "TC" -> "nT")
  nuc_cnt_names <- paste0("n", substr(populations, start = 1, stop = 1))

  # Each nucleotide is simulated once, in order of first appearance in `populations`
  sim_nucs <- unique(nuc_cnt_names)

  # What is vector of nucleotide content probabilities
  nuc_probs <- c(Ucont, Acont, Gcont, Ccont)
  names(nuc_probs) <- c("nT", "nA", "nG", "nC")

  unknown_nucs <- setdiff(sim_nucs, names(nuc_probs))
  if(length(unknown_nucs) > 0){

    stop("Each element of `populations` must start with T, A, G, or C; cannot simulate counts for: ",
         paste(unknown_nucs, collapse = ", "),
         call. = FALSE)

  }

  # Select by name so that row i of the simulated count matrix is sim_nucs[i];
  # the final element lumps together all nucleotides not being tracked.
  nuc_probs_sim <- nuc_probs[sim_nucs]
  nuc_probs_sim <- c(nuc_probs_sim,
                     1 - sum(nuc_probs_sim))

  # Row of the count matrix holding the mutable nucleotide of each population
  nuc_rows <- match(nuc_cnt_names, sim_nucs)


  ### Simulate read status in each population

  fraction_design_present <- fraction_design %>%
    dplyr::filter(present)


  muts_list <- vector(mode = "list",
                      length = length(populations))
  names(muts_list) <- populations

  # Per-feature results are collected here and bound once at the end
  feature_tables <- vector(mode = "list", length = nfeatures)

  for(f in seq_len(nfeatures)){

    # Simulate mutational population status: reads per population (npops x 1)
    type_cnts <- stats::rmultinom(1, size = read_vect[f],
                                  prob = fractions_matrix[f,])

    # Simulate nucleotide counts; one column per read of this feature
    nt_cnts <- stats::rmultinom(read_vect[f], size = readlength,
                                prob = nuc_probs_sim)

    # Reads are assigned to populations in consecutive blocks of columns;
    # pop_ends[t] is the last column belonging to population t
    pop_ends <- cumsum(type_cnts[,1])

    pop_tables <- vector(mode = "list", length = npops)

    # For each population, simulate counts for all mutation types
    for(t in seq_len(npops)){

      reads <- type_cnts[t,1]

      # Columns of nt_cnts for this population's reads; empty when reads == 0
      read_idx <- pop_ends[t] - reads + seq_len(reads)

      for(m in seq_along(populations)){

        pop <- populations[m]

        # High mutation rate if the design marks this type as labeled in
        # population t, low mutation rate otherwise
        is_high <- as.logical(fraction_design_present[[pop]][t])
        mut_prob <- if(is_high) phighs[[pop]] else plows[[pop]]

        # Trials are the mutable-nucleotide counts of these same reads, so a
        # read's mutation count can never exceed its reported nucleotide count
        muts_list[[pop]] <- stats::rbinom(reads,
                                          size = nt_cnts[nuc_rows[m], read_idx],
                                          prob = mut_prob)

      }

      # drop = FALSE keeps a matrix even for a single nucleotide type or a
      # single read, so there is always one row per read after transposing
      nt_block <- t(nt_cnts[seq_along(sim_nucs), read_idx, drop = FALSE])
      colnames(nt_block) <- sim_nucs

      pop_tables[[t]] <- dplyr::bind_cols(dplyr::as_tibble(muts_list),
                                          dplyr::as_tibble(nt_block))

    }

    muts_df <- dplyr::bind_rows(pop_tables)

    muts_df <- muts_df %>%
      dplyr::mutate(feature = rep(paste0(feature_prefix, f), times = nrow(muts_df)))

    feature_tables[[f]] <- muts_df

  }


  muts_final <- dplyr::bind_rows(feature_tables) %>%
    dplyr::mutate(sample = sample_name)


  ### Name the ground truth fraction columns

  # e.g., "true_fraction_highTC_lowGA": one _high/_low tag per design column
  design_cols <- setdiff(colnames(fraction_design_present), "present")
  nvect <- vapply(seq_len(nrow(fraction_design_present)), function(r){

    is_high <- vapply(design_cols,
                      function(cn) as.logical(fraction_design_present[[cn]][r]),
                      logical(1))

    paste0("true_fraction",
           paste0(ifelse(is_high, "_high", "_low"), design_cols, collapse = ""))

  }, character(1))

  colnames(fractions_matrix) <- nvect
  ground_truth <- dplyr::as_tibble(fractions_matrix)
  ground_truth$feature <- paste0(feature_prefix, seq_len(nrow(ground_truth)))

  # Collapse identical reads into counts (column `n`)
  cB <- muts_final %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(colnames(muts_final)))) %>%
    dplyr::count()

  return(list(cB = cB,
              ground_truth = ground_truth))

}




#' Simulate NR-seq data for multiple replicates of multiple biological conditions
#'
#' `EZSimulate()` is a user friendly wrapper to `SimulateMultiCondition()`. It
#' sets convenient defaults so as to quickly generate easy to interpret output.
#' `EZSimulate()` has all of the same parameters as `SimulateMultiCondition()`,
#' but it also has a number of additional parameters that guide its default behavior
#' and allow you to simulate multi-condition data without specifying the multiple,
#' sometimes complex, arguments that you would need to specify in `SimulateMultiCondition()`
#' to get the same behavior. In particular, users only have to set a single parameter,
#' `nfeatures` (number of features to simulate data for), by default. The `EZSimulate()`-unique
#' parameters `ntreatments` and `nreps` have default values that guide the simulation in the
#' case where only `nfeatures` is specified. In particular, `nreps` of `ntreatments` different
#' conditions will be simulated, with the assumed model `log(kdeg) ~ treatment` and `log(ksyn) ~ 1`.
#' In other words, different kdeg values will be simulated for each treatment level, and ksyn
#' values will not differ across conditions.
#'
#' @param nfeatures Number of "features" (e.g., genes) for which to simulate data.
#' @param mode Currently, EZSimulate can simulate in two modes: "standard" and "dynamics".
#' The former is the default and involves simulating multiple conditions of standard NR-seq data.
#' "dynamics" calls `SimulateDynamics()` under the hood to simulate a dynamical systems
#' model of your choice. Most of the additional parameters do not apply if mode == "dynamics",
#' except for those from dynamics_preset and on.
#' @param ntreatments Number of distinct treatments to simulate. This parameter is
#' only relevant if `metadf` is not provided.
#' @param nreps Number of replicates of each treatment to simulate. This parameter is
#' only relevant if `metadf` is not provided
#' @param nctlreps Number of -s4U replicates of each treatment to simulate. This parameter
#' is only relevant if `metadf` is not provided.
#' @param metadf A data frame with the following columns:
#' \itemize{
#'  \item sample: Names given to samples to simulate.
#'  \item \code{<details>}: Any number of columns with any names (not taken by other metadf columns)
#'  storing factors by which the samples can be stratified. These can be referenced
#'  in `mean_formula`, described below.
#' }
#' These parameters (described more below) can also be included in metadf to specify sample-specific simulation
#' parameters:
#' \itemize{
#'  \item seqdepth
#'  \item label_time
#'  \item pnew
#'  \item pold
#'  \item readlength
#'  \item Ucont
#' }
#' @param mean_formula A formula object that specifies the linear model used to
#' relate the factors in the \code{<details>} columns of `metadf` to average log(kdegs) and
#' log(ksyns) in each sample.
#' @param param_details A data frame with one row for each column of the design matrix
#' obtained from `model.matrix(mean_formula, metadf)` that describes how to simulate
#' the linear model parameters. The columns of this data frame are:
#' \itemize{
#'  \item param: Name of linear model parameter as it appears in the column names of the
#'  design matrix from `model.matrix(mean_formula, metadf)`.
#'  \item reference: Boolean; TRUE if you want to treat that parameter as a "reference". This
#'  means that all other parameter values that aren't global parameters are set equal to this
#'  unless otherwise determined (see `pdiff_*` parameters for how it is determined if a parameter
#'  will differ from the reference).
#'  \item global: Boolean; TRUE if you want to treat that parameter as a global parameter. This means
#'  that a single value is used for all features.
#'  \item logkdeg_mean: If parameter is the reference, then its value for the log(kdeg) linear model
#'  will be drawn from a normal distribution with this mean. If it is a global parameter, then this
#'  value will be used. If it is neither of these, then its value in the log(kdeg) linear model will
#'  either be the reference (if there is no difference between this condition's value and the reference)
#'  or the reference's value + a normally distributed random variable centered on this value.
#'  \item logkdeg_sd: sd used for draws from normal distribution as described for `logkdeg_mean`.
#'  \item logksyn_mean: Same as `logkdeg_mean` but for log(ksyn) linear model.
#'  \item logksyn_sd: Same as `logkdeg_sd` but for log(ksyn) linear model.
#'  \item pdiff_ks: Proportion of features whose value of this parameter in the log(ksyn) linear model
#'  will differ from the reference's. Should be a number between 0 and 1, inclusive. For example, if
#'  `pdiff_ks` is 0.1, then for 10% of features, this parameter will equal the reference parameter +
#'  a normally distributed random variable with mean `logksyn_mean` and sd `logksyn_sd`. For the other
#'  90% of features, this parameter will equal the reference.
#'  \item pdiff_kd: Same as `pdiff_ks` but for log(kdeg) linear model.
#'  \item pdiff_both: Proportion of features whose value for this parameter in BOTH the
#'  log(kdeg) and log(ksyn) linear models will differ from the reference. Value must be
#'  between 0 and min(c(pdiff_kd, pdiff_ks)) in that row.
#' }
#' If param_details is not specified by the user, the first column of the design matrix
#' is assumed to represent the reference parameter, all parameters are assumed to be
#' non-global, logkdeg_mean and logksyn_mean are set to the equivalently named parameter values
#' described below for the reference and `logkdeg_diff_avg` and `logksyn_diff_avg` for all other parameters,
#' logkdeg_sd and logksyn_sd are set to the equivalently named parameter values
#' described below for the reference and `logkdeg_diff_sd` and `logksyn_diff_sd` for all other parameters,
#' and pdiff_kd, pdiff_ks, and pdiff_both are all set to the equivalently named parameter values.
#' @param seqdepth Total number of reads in each sample.
#' @param label_time Length of s^4^U feed to simulate.
#' @param pnew Probability that a T is mutated to a C if a read is new.
#' @param pold Probability that a T is mutated to a C if a read is old.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param Ucont_alpha Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape1 = `Ucont_alpha`.
#' @param Ucont_beta Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape2 = `Ucont_beta`.
#' @param logkdeg_mean Mean of normal distribution from which reference log(kdeg)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param dispslope Negative binomial dispersion parameter "slope" with respect to read counts. See
#' DESeq2 paper for dispersion model used.
#' @param dispint Negative binomial dispersion parameter "intercept" with respect to read counts. See
#' DESeq2 paper for dispersion model used.
#' @param logkdegsdtrend_slope Slope for log10(read count) vs. log(kdeg) replicate variability trend
#' @param logkdegsdtrend_intercept Intercept for log10(read count) vs. log(kdeg) replicate variability trend
#' @param logksynsdtrend_slope Slope for log10(read count) vs. log(ksyn) replicate variability trend
#' @param logksynsdtrend_intercept Intercept for log10(read count) vs. log(ksyn) replicate variability trend
#' @param logkdeg_sd Standard deviation of normal distribution from which reference log(kdeg)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logksyn_mean Mean of normal distribution from which reference log(ksyn)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logksyn_sd Standard deviation of normal distribution from which reference log(ksyn)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logkdeg_diff_avg Mean of normal distribution from which non-reference log(kdeg)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logkdeg_diff_sd Standard deviation of normal distribution from which non-reference log(kdeg)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logksyn_diff_avg Mean of normal distribution from which non-reference log(ksyn)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logksyn_diff_sd Standard deviation of normal distribution from which non-reference log(ksyn)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param pdiff_kd Proportion of features for which non-reference log(kdeg) linear model parameters
#' differ from the reference.
#' @param pdiff_ks Proportion of features for which non-reference log(ksyn) linear model parameters
#' differ from the reference.
#' @param pdiff_both Proportion of features for which BOTH non-reference log(kdeg) and log(ksyn) linear model parameters
#' differ from the reference.
#' @param pdo Dropout rate; think of this as the probability that a s4U containing
#' molecule is lost during library preparation and sequencing. If `pdo` is 0 (default)
#' then there is no dropout.
#' @param dynamics_preset Which preset model to use for simulation of dynamics.
#' Therefore, only relevant if `mode` == `dynamics`. Options are:
#' \describe{
#'  \item{nuc2cyto}{Simplest model of nuclear and cytoplasmic RNA dynamics: 0 -> N -> C -> 0}
#'  \item{preRNA}{Simplest model of pre-RNA and mature RNA dynamics: 0 -> P -> M -> 0}
#'  \item{preRNAwithPdeg}{Same as preRNA, but now pre-RNA can also degrade.}
#'  \item{nuc2cytowithNdeg}{Same as nuc2cyto, but now nuclear RNA can also degrade.}
#'  \item{subtlseq}{Subcellular TimeLapse-seq model, similar to that described in Ietswaart et al., 2024.
#'  Simplest model discussed there, lacking nuclear degradation: 0 -> CH -> NP -> CY -> PL -> 0, and CY can
#'  also degrade.}
#'  \item{nuc2cytowithpreRNA}{Combination of nuc2cyto and preRNA where preRNA is first synthesized,
#'  then either processed or exported to the cytoplasm. Processing can also occur in the cytoplasm, and
#'  mature nuclear RNA can be exported to the cytoplasm. Only  mature RNA degrades.}
#' }
#' @param unassigned_name String to give to reads not assigned to a given feature.
#' @param dispersion Negative binomial `size` parameter to use for simulating read counts
#' @param lfn_sd Logit(fn) replicate variability.
#' @param log_means Vector of log-Normal logmeans from which the distribution of
#' feature-specific parameters will be drawn from. Length of vector should be the same
#' as max(entries in `graph`), i.e., the number of parameters in your specified model.
#' If not provided, will by default be `c(1, seq(from = -0.5, to = -2.5, length.out = max(graph) - 1 ))`.
#' `1` for the ksyn parameter (which is always denoted 1 in the preset `graph`) is arbitrary.
#' Remaining parameters will make it so indices order parameters from fastest to slowest process.
#' @param log_sds Vector of log-Normal logsds from which the distribution of
#' feature-specific parameters will be drawn from. If not provided, will be 0.4 for all parameters.
#' @param treatment_effects Data frame describing effects of treatment on each
#' parameter. Should have five columns: "parameter_index", "treatment_index", "mean", "sd",
#' and "fraction_affected".
#' Each row corresponds to the effect the ith (i = treatment_index) treatment has on the
#' jth (j = parameter_index) kinetic parameter. Effect sizes, on a log-scale, are drawn from
#' a Normal distribution with mean and standard deviation set by the mean and sd columns,
#' respectively. The number of non-zero effects is set by "fraction_affected", and is
#' equal to `ceiling(nfeatures * fraction_affected)`. treatment_index of 1 will be ignored
#' and can either be included or not.
#' @param effect_avg_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `mean` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @param effect_sd_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `sd` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @param fraction_affected_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `fraction_affected` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @import data.table
#' @importFrom magrittr %>%
#' @return A list containing 5 elements:
#' \itemize{
#'  \item cB: Tibble that can be provided as the `cB` arg to `EZbakRData()`.
#'  \item metadf: Tibble that can be provided as the `metadf` arg to `EZbakRData()`.
#'  \item PerRepTruth: Tibble containing replicate-by-replicate simulated ground truth
#'  \item AvgTruth: Tibble containing average simulated ground truth
#'  \item param_details: Tibble containing information about simulated linear model parameters
#' }
#' @examples
#'
#' # Simulate standard data
#' simdata_standard <- EZSimulate(30)
#'
#' # Simulate dynamical systems data
#' simdata_ode <- EZSimulate(30,
#'                           mode = "dynamics",
#'                           ntreatments = 1,
#'                           label_time = c(1, 3),
#'                           dynamics_preset = "nuc2cyto")
#'
#' @export
EZSimulate <- function(nfeatures,
                       mode = c("standard", "dynamics"),
                       ntreatments = ifelse(mode == "standard", 2, 1), nreps = 3, nctlreps = 1,
                       metadf = NULL,
                       mean_formula = NULL,
                       param_details = NULL,
                       seqdepth = nfeatures*2500, label_time = 2,
                       pnew = 0.05, pold = 0.001,
                       readlength = 200, Ucont_alpha = 25,
                       Ucont_beta = 75,
                       feature_prefix = "Gene",
                       dispslope = 5, dispint = 0.01,
                       logkdegsdtrend_slope = -0.3,
                       logkdegsdtrend_intercept = -2.25,
                       logksynsdtrend_slope = -0.3,
                       logksynsdtrend_intercept = -2.25,
                       logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                       logksyn_mean = 2.3, logksyn_sd = 0.7,
                       logkdeg_diff_avg = 0, logksyn_diff_avg = 0,
                       logkdeg_diff_sd = 0.5, logksyn_diff_sd = 0.5,
                       pdiff_kd = 0.1, pdiff_ks = 0, pdiff_both = 0,
                       pdo = 0,
                       dynamics_preset = c("preRNA", "nuc2cyto",
                                           "preRNAwithPdeg", "nuc2cytowithNdeg",
                                           "subtlseq", "nuc2cytowithpreRNA"),
                       unassigned_name = "__no_feature",
                       dispersion = 1000,
                       lfn_sd = 0.2,
                       treatment_effects = NULL,
                       effect_avg_default = 0,
                       effect_sd_default = 0.5,
                       fraction_affected_default = 0.5,
                       log_means = NULL,
                       log_sds = NULL){

  # Hack to deal with devtools::check() NOTEs
  GF <- NULL

  # Resolve `mode` to a single string before anything else. The default value
  # of `ntreatments` is a lazily evaluated expression that compares `mode` to
  # "standard", so `ntreatments` must not be touched before this line.
  mode <- match.arg(mode)


  if(mode == "standard"){

    ### NOTE: This is obviously currently a very trivial wrapper that could
    ### just be the default behavior of SimulateMultiCondition(). That being said,
    ### I suspect that there will be other aspects of simulation parameter setting
    ### that I would like to automate, so for now I will keep this as is

    # TRUE if the experimental design has to be generated here rather than
    # taken from the user
    impute_design <- is.null(metadf)

    if(impute_design){

      n_s4U <- nreps * ntreatments

      # `1:0` would silently yield two samples, so fail early and clearly
      if(n_s4U < 1){
        stop("`nreps` and `ntreatments` must both be at least 1 when `metadf` is not provided.",
             call. = FALSE)
      }

      # One mean per treatment; any user-supplied `mean_formula` is overridden
      mean_formula <- stats::as.formula('~treatment-1')

      # +s4U samples: `nreps` replicates of each treatment
      metadf <- dplyr::tibble(sample = paste0('sample', seq_len(n_s4U)),
                              treatment = rep(paste0('treatment', seq_len(ntreatments)),
                                              each = nreps),
                              label_time = label_time)

      # -s4U samples (label_time of 0): `nctlreps` replicates of each treatment.
      # Skipped entirely when no controls are requested; otherwise
      # `ctl_start:ctl_end` would count downwards and produce bogus samples.
      if(nctlreps > 0){

        ctl_start <- n_s4U + 1
        ctl_end <- n_s4U + nctlreps*ntreatments

        metadf_ctl <- dplyr::tibble(sample = paste0('sample', ctl_start:ctl_end),
                                    treatment = rep(paste0('treatment', seq_len(ntreatments)),
                                                    each = nctlreps),
                                    label_time = 0)

        metadf <- dplyr::bind_rows(metadf,
                                   metadf_ctl)

      }

    }


    simdata <- SimulateMultiCondition(nfeatures = nfeatures, metadf = metadf,
                                      mean_formula = mean_formula,
                                      param_details = param_details,
                                      seqdepth = seqdepth, label_time = label_time,
                                      pnew = pnew, pold = pold, readlength = readlength,
                                      Ucont_alpha = Ucont_alpha, Ucont_beta = Ucont_beta,
                                      feature_prefix = feature_prefix,
                                      dispslope = dispslope, dispint = dispint,
                                      logkdegsdtrend_slope = logkdegsdtrend_slope,
                                      logkdegsdtrend_intercept = logkdegsdtrend_intercept,
                                      logksynsdtrend_slope = logksynsdtrend_slope,
                                      logksynsdtrend_intercept = logksynsdtrend_intercept,
                                      logkdeg_mean = logkdeg_mean, logkdeg_sd = logkdeg_sd,
                                      logksyn_mean = logksyn_mean, logksyn_sd = logksyn_sd,
                                      logkdeg_diff_avg = logkdeg_diff_avg,
                                      logksyn_diff_avg = logksyn_diff_avg,
                                      logkdeg_diff_sd = logkdeg_diff_sd,
                                      logksyn_diff_sd = logksyn_diff_sd,
                                      pdiff_kd = pdiff_kd, pdiff_ks = pdiff_ks,
                                      pdiff_both = pdiff_both, pdo = pdo)


    # Only the imputed design replaces the returned metadf; a user-supplied
    # metadf leaves the output of SimulateMultiCondition() untouched
    if(impute_design){

      simdata[['metadf']] <- metadf %>%
        dplyr::rename(tl = label_time)

    }


  }else{
    ### EZDYNAMICS WRAPPER

    ode_models_internal <- create_odemodels_internally()

    preset <- match.arg(dynamics_preset)

    graph <- ode_models_internal[[preset]][["graph"]]
    formulas <- ode_models_internal[[preset]][["formulas"]]


    # One sample per (treatment, replicate, label time, formula) combination
    n_formulas <- length(formulas)
    n_samples <- length(label_time)*n_formulas*nreps*ntreatments
    sample_idx <- seq_len(n_samples)

    # Formulas cycle fastest; the label time advances once per full cycle of
    # formulas and itself cycles through `label_time`
    formula_idx <- ((sample_idx - 1) %% n_formulas) + 1
    tl_idx <- ((ceiling(sample_idx/n_formulas) - 1) %% length(label_time)) + 1

    formula_list <- lapply(formula_idx, function(k) formulas[[k]])
    tl_vect <- label_time[tl_idx]
    compartment_vect <- names(formulas)[formula_idx]

    # Treatments vary slowest: all samples of treatment 1, then treatment 2, ...
    treatment_vect <- rep(seq_len(ntreatments),
                          each = length(label_time)*n_formulas*nreps)

    # The same sample names label the formulas and the rows of metadf
    sample_names <- paste0("sample",  generate_pattern(n_samples))

    names(formula_list) <- sample_names

    metadf <- dplyr::tibble(sample = sample_names,
                            compartment = compartment_vect,
                            tl = tl_vect,
                            treatment = paste0("treatment", treatment_vect))

    # means of log of parameters
    if(is.null(log_means)){

      log_means <- c(1, seq(from = -0.5, to = -2.5,
                            length.out = (max(graph)-1)))

    }

    # population sds on log scale of parameters
    if(is.null(log_sds)){

      log_sds <- rep(0.4, times = max(graph))

    }


    simdata <- SimulateDynamics(nfeatures = nfeatures,
                                graph = graph,
                                metadf = metadf,
                                formula_list = formula_list,
                                log_means = log_means,
                                log_sds = log_sds,
                                unassigned_name = unassigned_name,
                                seqdepth = seqdepth,
                                dispersion = dispersion,
                                lfn_sd = lfn_sd,
                                effect_avg_default = effect_avg_default,
                                effect_sd_default = effect_sd_default,
                                fraction_affected_default = fraction_affected_default,
                                ntreatments = ntreatments,
                                treatment_effects = treatment_effects)

    # Need to impute grouping feature in pre-RNA case: use GF unless it is
    # the unassigned placeholder, in which case fall back to XF
    if(preset %in% c("preRNA", "preRNAwithPdeg", "nuc2cytowithpreRNA")){
      simdata$cB <- simdata$cB %>%
        dplyr::mutate(feature = dplyr::case_when(
          GF == unassigned_name ~ XF,
          .default = GF
        ))
    }


    simdata$metadf <- metadf

  }

  return(simdata)


}





#' Simulate NR-seq data for multiple replicates of multiple biological conditions
#'
#' `SimulateMultiCondition` is a highly flexible simulator that combines linear modeling
#' of log(kdeg)'s and log(ksyn)'s with `SimulateOneRep` to simulate an NR-seq dataset. The linear model
#' allows you to simulate multiple distinct treatments, batch effects, interaction effects,
#' etc. The current downside for its flexibility is its relative complexity to implement.
#' Easier to use simulators are on the way to EZbakR.
#'
#' @param nfeatures Number of "features" (e.g., genes) to simulate data for
#' @param metadf A data frame with the following columns:
#' \itemize{
#'  \item sample: Names given to samples to simulate.
#'  \item \code{<details>}: Any number of columns with any names (not taken by other metadf columns)
#'  storing factors by which the samples can be stratified. These can be referenced
#'  in `mean_formula`, described below.
#' }
#' These parameters (described more below) can also be included in metadf to specify sample-specific simulation
#' parameters:
#' \itemize{
#'  \item seqdepth
#'  \item label_time
#'  \item pnew
#'  \item pold
#'  \item readlength
#'  \item Ucont
#' }
#' @param mean_formula A formula object that specifies the linear model used to
#' relate the factors in the \code{<details>} columns of `metadf` to average log(kdegs) and
#' log(ksyns) in each sample.
#' @param param_details A data frame with one row for each column of the design matrix
#' obtained from `model.matrix(mean_formula, metadf)` that describes how to simulate
#' the linear model parameters. The columns of this data frame are:
#' \itemize{
#'  \item param: Name of linear model parameter as it appears in the column names of the
#'  design matrix from `model.matrix(mean_formula, metadf)`.
#'  \item reference: Boolean; TRUE if you want to treat that parameter as a "reference". This
#'  means that all other parameter values that aren't global parameters are set equal to this
#'  unless otherwise determined (see `pdiff_*` parameters for how it is determined if a parameter
#'  will differ from the reference).
#'  \item global: Boolean; TRUE if you want to treat that parameter as a global parameter. This means
#'  that a single value is used for all features.
#'  \item logkdeg_mean: If parameter is the reference, then its value for the log(kdeg) linear model
#'  will be drawn from a normal distribution with this mean. If it is a global parameter, then this
#'  value will be used. If it is neither of these, then its value in the log(kdeg) linear model will
#'  either be the reference (if there is no difference between this condition's value and the reference)
#'  or the reference's value + a normally distributed random variable centered on this value.
#'  \item logkdeg_sd: sd used for draws from normal distribution as described for `logkdeg_mean`.
#'  \item logksyn_mean: Same as `logkdeg_mean` but for log(ksyn) linear model.
#'  \item logksyn_sd: Same as `logkdeg_sd` but for log(ksyn) linear model.
#'  \item pdiff_ks: Proportion of features whose value of this parameter in the log(ksyn) linear model
#'  will differ from the reference's. Should be a number between 0 and 1, inclusive. For example, if
#'  `pdiff_ks` is 0.1, then for 10% of features, this parameter will equal the reference parameter +
#'  a normally distributed random variable with mean `logksyn_mean` and sd `logksyn_sd`. For the other
#'  90% of features, this parameter will equal the reference.
#'  \item pdiff_kd: Same as `pdiff_ks` but for log(kdeg) linear model.
#'  \item pdiff_both: Proportion of features whose value for this parameter in BOTH the
#'  log(kdeg) and log(ksyn) linear models will differ from the reference. Value must be
#'  between 0 and min(c(pdiff_kd, pdiff_ks)) in that row.
#' }
#' If param_details is not specified by the user, the first column of the design matrix
#' is assumed to represent the reference parameter, all parameters are assumed to be
#' non-global, logkdeg_mean and logksyn_mean are set to the equivalently named parameter values
#' described below for the reference and `logkdeg_diff_avg` and `logksyn_diff_avg` for all other parameters,
#' logkdeg_sd and logksyn_sd are set to the equivalently named parameter values
#' described below for the reference and `logkdeg_diff_sd` and `logksyn_diff_sd` for all other parameters,
#' and pdiff_kd, pdiff_ks, and pdiff_both are all set to the equivalently named parameter values.
#' @param seqdepth Total number of reads in each sample. Used for all samples when
#' `metadf` does not have a `seqdepth` column.
#' @param label_time Length of s^4^U feed to simulate.
#' @param pnew Probability that a T is mutated to a C if a read is new.
#' @param pold Probability that a T is mutated to a C if a read is old.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param Ucont_alpha Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape1 = `Ucont_alpha`.
#' @param Ucont_beta Probability that a nucleotide in a simulated read from a given feature
#' is a U is drawn from a beta distribution with shape2 = `Ucont_beta`.
#' @param logkdeg_mean Mean of normal distribution from which reference log(kdeg)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param dispslope Negative binomial dispersion parameter "slope" with respect to read counts. See
#' DESeq2 paper for dispersion model used.
#' @param dispint Negative binomial dispersion parameter "intercept" with respect to read counts. See
#' DESeq2 paper for dispersion model used.
#' @param logkdegsdtrend_slope Slope for log10(read count) vs. log(kdeg) replicate variability trend
#' @param logkdegsdtrend_intercept Intercept for log10(read count) vs. log(kdeg) replicate variability trend
#' @param logksynsdtrend_slope Slope for log10(read count) vs. log(ksyn) replicate variability trend
#' @param logksynsdtrend_intercept Intercept for log10(read count) vs. log(ksyn) replicate variability trend
#' @param logkdeg_sd Standard deviation of normal distribution from which reference log(kdeg)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logksyn_mean Mean of normal distribution from which reference log(ksyn)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logksyn_sd Standard deviation of normal distribution from which reference log(ksyn)
#' linear model parameter is drawn from for each feature if `param_details` is not provided.
#' @param logkdeg_diff_avg Mean of normal distribution from which non-reference log(kdeg)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logkdeg_diff_sd Standard deviation of normal distribution from which non-reference log(kdeg)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logksyn_diff_avg Mean of normal distribution from which non-reference log(ksyn)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param logksyn_diff_sd Standard deviation of normal distribution from which non-reference log(ksyn)
#' linear model parameters are drawn from for each feature if `param_details` is not provided.
#' @param pdiff_kd Proportion of features for which non-reference log(kdeg) linear model parameters
#' differ from the reference.
#' @param pdiff_ks Proportion of features for which non-reference log(ksyn) linear model parameters
#' differ from the reference.
#' @param pdiff_both Proportion of features for which BOTH non-reference log(kdeg) and log(ksyn) linear model parameters
#' differ from the reference.
#' @param pdo Dropout rate; think of this as the probability that a s4U containing
#' molecule is lost during library preparation and sequencing. If `pdo` is 0 (default)
#' then there is no dropout.
#' @import data.table
#' @importFrom magrittr %>%
#' @return A list containing 6 elements:
#' \itemize{
#'  \item cB: Tibble that can be provided as the `cB` arg to `EZbakRData()`.
#'  \item metadf: Tibble that can be provided as the `metadf` arg to `EZbakRData()`.
#'  \item PerRepTruth: Tibble containing replicate-by-replicate simulated ground truth
#'  \item AvgTruth: Tibble containing average simulated ground truth
#'  \item param_details: Tibble containing information about simulated linear model parameters
#'  \item UnbiasedFractions: Tibble containing no dropout ground truth
#' }
#' @examples
#' simdata <- SimulateMultiCondition(30,
#'                                   data.frame(sample = c('sampleA', 'sampleB'),
#'                                   treatment = c('treatment1', 'treatment2')),
#'                                   mean_formula = ~treatment-1)
#' @export
SimulateMultiCondition <- function(nfeatures, metadf, mean_formula,
                                   param_details = NULL,
                                   seqdepth = nfeatures*2500, label_time = 2,
                                   pnew = 0.05, pold = 0.001,
                                   readlength = 200, Ucont_alpha = 25,
                                   Ucont_beta = 75,
                                   feature_prefix = "Gene",
                                   dispslope = 5, dispint = 0.01,
                                   logkdegsdtrend_slope = -0.3,
                                   logkdegsdtrend_intercept = -2.25,
                                   logksynsdtrend_slope = -0.3,
                                   logksynsdtrend_intercept = -2.25,
                                   logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                                   logksyn_mean = 2.3, logksyn_sd = 0.7,
                                   logkdeg_diff_avg = 0, logksyn_diff_avg = 0,
                                   logkdeg_diff_sd = 0.5, logksyn_diff_sd = 0.5,
                                   pdiff_kd = 0.1, pdiff_ks = 0, pdiff_both = 0,
                                   pdo = 0){

  ### Hack to deal with devtools::check() NOTEs
  reference <- param <- NULL

  `.` <- list


  ### Create param_details if not provided

  # Design matrix relating samples (rows) to linear model parameters (columns)
  mean_design <- stats::model.matrix(mean_formula, metadf)
  mean_design_cols <- colnames(mean_design)

  if(is.null(param_details)){

    # By default the first design matrix column is treated as the reference
    # parameter; every other column is a non-global effect whose distribution
    # is set by the *_diff_avg/*_diff_sd arguments.
    lmdc <- length(mean_design_cols)

    param_details <- dplyr::tibble(
      param = mean_design_cols,
      reference = rep(c(TRUE, FALSE), times = c(1, lmdc - 1)),
      global = FALSE,
      logkdeg_mean = rep(c(logkdeg_mean, logkdeg_diff_avg), times = c(1, lmdc - 1)),
      logkdeg_sd = rep(c(logkdeg_sd, logkdeg_diff_sd), times = c(1, lmdc - 1)),
      logksyn_mean = rep(c(logksyn_mean, logksyn_diff_avg), times = c(1, lmdc - 1)),
      logksyn_sd = rep(c(logksyn_sd, logksyn_diff_sd), times = c(1, lmdc - 1)),
      pdiff_kd = pdiff_kd,
      pdiff_ks = pdiff_ks,
      pdiff_both = pdiff_both
    )

  }


  ### Check validity of input

  args <- c(as.list(environment()))

  check_SimulateMultiCondition_input(args)


  ### Fill metadf with parameters that are only specified as single value
  # Columns already present in metadf take precedence over the scalar arguments.

  sample_defaults <- list(seqdepth = seqdepth,
                          label_time = label_time,
                          pnew = pnew,
                          pold = pold,
                          pdo = pdo)

  for(nm in setdiff(names(sample_defaults), colnames(metadf))){

    metadf[[nm]] <- sample_defaults[[nm]]

  }


  ### Need to simulate linear model parameter values for all parameters specified

  pdref <- param_details %>%
    dplyr::filter(reference)

  # Reference log(kdegs)
  logkdeg_ref <- stats::rnorm(nfeatures,
                              mean = pdref$logkdeg_mean,
                              sd = pdref$logkdeg_sd)

  # Reference log(ksyns)
  logksyn_ref <- stats::rnorm(nfeatures,
                              mean = pdref$logksyn_mean,
                              sd = pdref$logksyn_sd)


  # Number of samples to simulate; to be used multiple times later
  nsamp <- nrow(metadf)

  # Preallocate lists to store log(kdeg) and log(ksyn)s linear model parameters
  # for each sample and feature
  logkdeg_params <- vector(mode = "list",
                           length = length(mean_design_cols))
  logksyn_params <- logkdeg_params

  # Determine linear model parameter values
  for(p in seq_along(mean_design_cols)){

    pd <- param_details %>%
      dplyr::filter(param == mean_design_cols[p])

    if(pd$reference){

      ### EASY: Just use reference value for reference parameters
      logkdeg_params[[p]] <- logkdeg_ref
      logksyn_params[[p]] <- logksyn_ref

    }else if(pd$global){

      ### EASY: Just use global value for global parameters
      logkdeg_params[[p]] <- rep(pd$logkdeg_mean, times = nfeatures)
      logksyn_params[[p]] <- rep(pd$logksyn_mean, times = nfeatures)

    }else{

      ### HARD: Need to simulate non-reference, non-global values with respect
      ### to the references. Some fraction of these will differ from reference,
      ### some fraction will be exactly the same as the reference.
      ###
      ### User also specifies what fraction of the time they want both kdeg and
      ### ksyn to differ with respect to reference.
      ###
      ### Layout along the feature index: the first `n_kd_diff` features get a
      ### kdeg effect; ksyn effects start `pdiff_both*nfeatures` features before
      ### the end of that run (the overlap) and continue for `n_ks_diff` features.

      n_kd_diff <- ceiling(pd$pdiff_kd*nfeatures)
      ks_start <- ceiling(n_kd_diff - pd$pdiff_both*nfeatures)

      # The ksyn count must be a whole number: rep() truncates fractional
      # `times`, which previously could yield an indicator shorter than
      # nfeatures that was then silently recycled. Round up (as for kdeg) and
      # cap at the number of features remaining after `ks_start`.
      n_ks_diff <- min(ceiling(pd$pdiff_ks*nfeatures),
                       nfeatures - ks_start)

      is_kdeg_param_nonzero <- rep(c(1, 0),
                                   times = c(n_kd_diff,
                                             nfeatures - n_kd_diff))

      is_ksyn_param_nonzero <- rep(c(0, 1, 0),
                                   times = c(ks_start,
                                             n_ks_diff,
                                             nfeatures - ks_start - n_ks_diff))

      # Effects are drawn for every feature and then zeroed out where the
      # indicator is 0, so unaffected features equal the reference exactly.
      logkdeg_params[[p]] <- is_kdeg_param_nonzero*stats::rnorm(nfeatures,
                                                                pd$logkdeg_mean,
                                                                pd$logkdeg_sd) +
        logkdeg_ref

      logksyn_params[[p]] <- is_ksyn_param_nonzero*stats::rnorm(nfeatures,
                                                                pd$logksyn_mean,
                                                                pd$logksyn_sd) +
        logksyn_ref

    }

  }

  # Function to extract the ith element of each entry of a list
  # For grabbing a feature's set of parameter from logkdeg/ksyn_params
  extract_ith <- function(list, i) {
    sapply(list, function(x) x[i])
  }

  # Function to compute mean log(kdeg), log(ksyn), and abundance
  # in each sample. Returns three lists with one element per feature, each
  # element holding one value per sample (row of the design matrix X).
  compute_kinetics <- function(X, logkdeg_params,
                               logksyn_params,
                               pdo,
                               tl,
                               n = nfeatures) {

    abundance <- vector("list", n)
    logkdeg <- abundance
    logksyn <- abundance

    for (i in seq_len(n)) {
      logksyn_i <- X %*% extract_ith(logksyn_params, i)
      logkdeg_i <- X %*% extract_ith(logkdeg_params, i)

      logkdeg[[i]] <- logkdeg_i
      logksyn[[i]] <- logksyn_i

      # Compute dropout adjusted abundance; just ksyn/kdeg if no dropout
      fn <- 1 - exp(-exp(logkdeg_i)*tl)
      abundance[[i]] <- (exp(logksyn_i)/exp(logkdeg_i))*((1 - fn) + (1 - pdo)*fn)

    }

    return(list(abundance = abundance,
                logkdeg = logkdeg,
                logksyn = logksyn))
  }

  # Function to normalize abundances and calculate expected read count
  # for each feature in each sample. Also returns, per feature, the z-score of
  # its log10(abundance) within each sample.
  compute_readcounts <- function(abundances,
                                 seqdepths,
                                 n = nsamp){

    sums <- rep(0, times = n)
    means <- sums
    sds <- sums

    for(i in seq_len(n)){

      abundances_i <- extract_ith(abundances, i)

      sums[i] <- sum(abundances_i)
      means[i] <- mean(log10(abundances_i))
      sds[i] <- stats::sd(log10(abundances_i))

    }

    readcounts <- lapply(abundances,
                         function(vector) (vector/sums)*seqdepths)
    zscores <- lapply(abundances,
                      function(vector) (log10(vector) - means)/sds)

    return(list(readcounts = readcounts,
                read_zscores = zscores))

  }


  ### Determine sample-specific averages to simulate from

  kinetics <- compute_kinetics(mean_design, logkdeg_params,
                               logksyn_params,
                               pdo = metadf$pdo,
                               tl = metadf$label_time)

  read_info <- compute_readcounts(kinetics$abundance,
                                  seqdepths = metadf$seqdepth)


  ### Simulate per-replicate values using calculated means
  # log(kdeg) ~ Normal()
  # log(ksyn) ~ Normal()
  # read count ~ Negative Binomial()

  logkdegs <- vector(mode = "list",
                     length = nfeatures)
  logksyns <- logkdegs
  reads <- logkdegs

  for(i in seq_len(nfeatures)){

    feature_readavgs <- read_info$readcounts[[i]]
    feature_zscores <- read_info$read_zscores[[i]]

    # Replicate variability is a log-linear function of the abundance z-score
    logkdeg_sds <- exp(feature_zscores*logkdegsdtrend_slope +
                         logkdegsdtrend_intercept)

    logksyn_sds <- exp(feature_zscores*logksynsdtrend_slope +
                         logksynsdtrend_intercept)

    ### Each element is a vector of sample-specific kinetic parameters and read counts
    # NOTE: bit weird to define sample-specific kinetic parameters rather than
    # sample specific fraction news. From a practical perspective though, this
    # is equivalent to that strategy.
    logkdegs[[i]] <- stats::rnorm(n = nsamp,
                                  mean = kinetics$logkdeg[[i]],
                                  sd = logkdeg_sds)

    logksyns[[i]] <- stats::rnorm(n = nsamp,
                                  mean = kinetics$logksyn[[i]],
                                  sd = logksyn_sds)

    # Negative binomial dispersion shrinks as the expected read count grows
    reads[[i]] <- stats::rnbinom(n = nsamp,
                                 mu = feature_readavgs,
                                 size = 1/(dispslope/feature_readavgs + dispint))

  }


  ### Simulate data for each replicate
  simdata <- vector(mode = "list", length = nsamp)
  for(s in seq_len(nsamp)){

    pdo_s <- metadf$pdo[s]

    # Unbiased fraction new implied by this replicate's kdegs
    fn_unbiased <- 1 - exp(-exp(extract_ith(logkdegs, s))*metadf$label_time[s])

    # Dropout adjusted
    fn_vect <- (fn_unbiased*(1 - pdo_s))/(fn_unbiased*(1 - pdo_s) + (1 - fn_unbiased))

    # A bit unclear just from the interface (so suboptimal function design)
    # but the key here is that if there is dropout, fn_vect will represent
    # the dropout biased fraction new, and kdeg_vect will represent the
    # kdeg that would be estimated from the true, unbiased fraction new.
    # This ensures that the ground_truth table from this function contains
    # both the biased and unbiased fraction new estimates in it.
    #
    # feature_prefix is forwarded so that feature names in the cB and ground
    # truth tables agree with those in the AvgTruth/UnbiasedFractions tables.
    simdata[[s]] <- SimulateOneRep(nfeatures = nfeatures,
                                   read_vect = extract_ith(reads, s),
                                   label_time = as.numeric(metadf[s,"label_time"]),
                                   sample_name = as.character(metadf[s, "sample"]),
                                   feature_prefix = feature_prefix,
                                   fn_vect = fn_vect,
                                   kdeg_vect = exp(extract_ith(logkdegs, s)),
                                   ksyn_vect = exp(extract_ith(logksyns, s)),
                                   pnew = as.numeric(metadf[s, "pnew"]),
                                   pold = as.numeric(metadf[s, "pold"]),
                                   readlength = readlength,
                                   Ucont_alpha = Ucont_alpha,
                                   Ucont_beta = Ucont_beta)

  }

  # Combine replicate simulation data objects into one
  names_to_bind <- names(simdata[[1]])

  final_simdata <- lapply(names_to_bind, function(name) {
    dplyr::bind_rows(lapply(simdata, function(inner_list) inner_list[[name]]))
  })

  names(final_simdata) <- names_to_bind


  ### Save linear model parameters in convenient format

  names(logkdeg_params) <- paste0("true_logkdeg_", mean_design_cols)
  names(logksyn_params) <- paste0("true_logksyn_", mean_design_cols)

  feature_names <- paste0(feature_prefix, seq_len(nfeatures))

  kinetic_parameters <- dplyr::bind_cols(list(logkdeg_params, logksyn_params))
  kinetic_parameters[['feature']] <- feature_names


  ### Gather output

  # TO-DO: Am not tracking the replicate fraction news in a way
  # that accounts for potential dropout. PerRepTruth will only
  # contain the dropout biased fraction new estimate
  if(all(metadf$pdo == 0)){

    output <- list(cB = final_simdata$cB,
                   PerRepTruth = final_simdata$ground_truth,
                   AvgTruth = kinetic_parameters,
                   metadf = metadf,
                   param_details = param_details)

  }else{

    # With dropout, additionally report the fraction new and kdeg that would
    # be observed in the absence of dropout.
    nodropout_truth <- vector(mode = "list", length = nsamp)
    for(s in seq_len(nsamp)){

      kdeg_vect <- exp(extract_ith(logkdegs, s))
      fn_vect <- 1 - exp(-kdeg_vect*metadf$label_time[s])

      nodropout_truth[[s]] <- data.table::data.table(sample = metadf$sample[s],
                                                     feature = feature_names,
                                                     unbiased_fraction_highTC = fn_vect,
                                                     unbiased_kdeg = kdeg_vect)

    }

    output <- list(cB = final_simdata$cB,
                   PerRepTruth = final_simdata$ground_truth,
                   UnbiasedFractions = dplyr::bind_rows(nodropout_truth),
                   AvgTruth = kinetic_parameters,
                   metadf = metadf,
                   param_details = param_details)

  }


  return(output)


}



#' Simulation of transcript isoform kinetic parameters.
#'
#' `SimulateIsoforms()` performs a simple simulation of isoform-specific kinetic
#' parameters to showcase and test `EstimateIsoformFractions()`. It assumes that
#' there are a set of reads (fraction of total set by `funique` parameter) which
#' map uniquely to a given isoform, while the rest are ambiguous to all isoforms
#' from that gene. Mutational content of these reads are simulated as in
#' `SimulateOneRep()`.
#'
#' @param nfeatures Number of "features" to simulate data for. Each feature will
#' have a simulated number of transcript isoforms
#' @param nt (Optional), can provide a vector of the number of isoforms you would
#' like to simulate for each of the `nfeatures` features. Vector can either be length
#' 1, in which case that many isoforms will be simulated for all features, or length
#' equal to `nfeatures`.
#' @param seqdepth Total number of sequencing reads to simulate
#' @param label_time Length of s^4^U feed to simulate.
#' @param feature_prefix Name given to the i-th feature is `paste0(feature_prefix, i)`. Shows up in the
#' `feature` column of the output simulated data table.
#' @param sample_name Character vector to assign to `sample` column of output simulated
#' data table (the cB table).
#' @param pnew Probability that a T is mutated to a C if a read is new.
#' @param pold Probability that a T is mutated to a C if a read is old.
#' @param funique Fraction of reads that uniquely "map" to a single isoform.
#' @param readlength Length of simulated reads. In this simple simulation, all reads
#' are simulated as being exactly this length.
#' @param Ucont Percentage of nucleotides simulated to be U's.
#' @param avg_numiso Average number of isoforms for each feature. Feature-specific
#' isoform counts are drawn from a Poisson distribution with this average. NOTE:
#' to ensure that all features have multiple isoforms, the simulated number of
#' isoforms drawn from a Poisson distribution is incremented by 2. Thus, the
#' actual average number of isoforms from each feature is `avg_numiso` + 2.
#' @param psynthdiff Percentage of genes for which all isoform abundance differences
#' are synthesis driven. If not synthesis driven, then isoform abundance differences
#' will be driven by differences in isoform kdegs.
#' @param logkdeg_mean meanlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logkdeg_sd sdlog of a log-normal distribution from which
#' kdegs are simulated
#' @param logksyn_mean meanlog of a log-normal distribution from which
#' ksyns are simulated
#' @param logksyn_sd sdlog of a log-normal distribution from which
#' ksyns are simulated
#' @importFrom magrittr %>%
#' @return List with two elements:
#' \itemize{
#'  \item cB: Tibble that can be passed as the `cB` arg to `EZbakRData()`.
#'  \item ground_truth: Tibble containing simulated ground truth.
#' }
#' @examples
#' simdata <- SimulateIsoforms(30)
#' @export
SimulateIsoforms <- function(nfeatures,
                             nt = NULL,
                             seqdepth = nfeatures*2500,
                             label_time = 4,
                             sample_name = 'sampleA',
                             feature_prefix = 'Gene',
                             pnew = 0.1,
                             pold = 0.002,
                             funique = 0.2,
                             readlength = 200,
                             Ucont = 0.25,
                             avg_numiso = 2,
                             psynthdiff = 0.5,
                             logkdeg_mean = -1.9, logkdeg_sd = 0.7,
                             logksyn_mean = 2.3, logksyn_sd = 0.7
                             ){

  ### Hack to deal with devtools::check() NOTEs
  feature <- GF <- transcripts <- TC <- nT <- n <- NULL


  if(is.null(nt)){

    # Number of isoforms per genes (make them all multi-isoformgenes)
    nt <- stats::rpois(nfeatures, avg_numiso) + 2

  }else if(length(nt) == 1){

    nt <- rep(nt, times = nfeatures)

  }

  # Is synthesis different
  syn_driven <- as.logical(stats::rbinom(nfeatures, size = 1, prob = psynthdiff))

  # Proportions for each transcript
  pt <- rep(0, times = sum(nt))

  tracker <- 1
  for(i in seq_len(nfeatures)){

    # Simulate one dominant isoform and then a number of less prevalent isoforms
    # Idea is to generate a number on an unnormalized scale (9 = dominant,
    # other isoforms randomly assigned a real number between 1 and 4, uniform
    # distribution over that range), and then normalize it for each gene (i.e,
    # divide unnormalized number by sum of unnormalized numbers for that gene)
    idx <- tracker:(tracker + (nt[i] - 1))
    unnormalized <- c(9, stats::runif(n = (nt[i] - 1), min = 1, max = 4))
    pt[idx] <- unnormalized/sum(unnormalized)

    tracker <- tracker + nt[i]
  }


  # ksyn for each gene
  ks_g <- stats::rlnorm(nfeatures, meanlog = logksyn_mean,
                        sdlog = logksyn_sd)

  # kdeg for each gene
  kd_g <- stats::rlnorm(nfeatures, meanlog = logkdeg_mean,
                        sdlog = logkdeg_sd)

  # Simulate ks and kdeg for each transcript -------------------------------------

  # Gene index of every transcript
  gene_vect <- rep(seq_len(nfeatures), times = nt)

  # Dominant isoform is given the simulated gene-wide kinetic parameters
  # Each non-dominant isoform is either termed a "synthesis-driven" alternative
  # isoform or a "degradation-driven". A synthesis-driven alt. isoform is one
  # whose lower than dominant abundance is driven by a lower rate of synthesis
  # for that gene. A degradation-driven alt. isoform is one whose lower than
  # dominant abundance is driven by lower stability. Synthesis and degradation
  # rate constants are chosen so that steady-state abundances are consistent
  # with the isoform percentages simulated earlier
  dom_p <- stats::ave(pt, gene_vect, FUN = max)
  is_dominant <- pt == dom_p
  is_syn_driven <- syn_driven[gene_vect]
  ks_gene <- ks_g[gene_vect]
  kd_gene <- kd_g[gene_vect]

  ks_t <- ifelse(!is_dominant & is_syn_driven,
                 (pt/dom_p)*ks_gene,
                 ks_gene)

  kd_t <- ifelse(!is_dominant & !is_syn_driven,
                 (dom_p/pt)*kd_gene,
                 kd_gene)

  # Steady-state abundance of each transcript as a fraction of the total
  frac_abundance <- (ks_t/kd_t)/sum(ks_t/kd_t)

  # Fraction new of each transcript
  fn_t <- 1 - exp(-kd_t*label_time)

  # Transcript IDs, one per isoform
  ### TO-DO: Make the "Transcript" prefix a user provided parameter
  transcript_vect <- paste0(feature_prefix, gene_vect, "_", "Transcript",
                            sequence(nt))


  # Simulate reads from each transcript ------------------------------------------

  ### Non-unique reads
  # I randomly choose some fraction of reads to be uniquely mapping to one isoform
  # and some fraction to map to multiple isoforms.
  # For the non-unique isoforms, I assume that the probability a read mapped to
  # a given transcript is simply that transcript's relative abundance (i.e.,
  # the proportion of RNA generated from that gene which become that isoform.
  # this is `pt` simulated earlier).
  # NOTE:
  # I got confused once about how the mutational data is simulated in the case
  # of non-unique reads. These reads are unambiguously from a given transcript
  # isoform, so the fraction new that drives the simulation of the mutational
  # data is just that isoform's simulated fraction new. At one point I wondered
  # why I wasn't probablistically determining the transcript of origin for
  # a non-unique read. This is unnecessary as the number of reads originating
  # from a given isoform takes into account the relative abundance of each isoform.

  if(funique < 1){
    reads <- ceiling(frac_abundance*seqdepth*(1-funique))
    fns <- rep(fn_t, times = reads)


    # Number of Us
    nUs <- stats::rbinom(n = sum(reads), size = readlength, prob = Ucont)

    # Newness
    newness <- stats::rbinom(n = sum(reads), size = 1, prob = fns)

    # Mutations
    TCs <- stats::rbinom(n = sum(reads), size = nUs,
                         prob = newness*pnew + (1 - newness)*pold )

    # Ambiguous reads are labeled with all of a gene's isoforms joined by "+"
    ambiguous_ids <- vapply(seq_len(nfeatures), function(i){
      paste(paste0(feature_prefix, i, "_", "Transcript", seq_len(nt[i])),
            collapse = "+")
    }, character(1))

    # Make cB
    cB <- dplyr::tibble(TC = TCs, nT = nUs, read_ID = seq_along(TCs),
                        GF = paste0(feature_prefix, rep(gene_vect, times = reads)),
                        transcripts = rep(rep(ambiguous_ids, times = nt), times = reads))

  }else{
    cB <- dplyr::tibble()
  }



  ### Unique reads

  reads_u <- ceiling(frac_abundance*seqdepth*(funique))
  fns <- rep(fn_t, times = reads_u)


  # Number of Us
  nUs <- stats::rbinom(n = sum(reads_u), size = readlength, prob = Ucont)

  # Newness
  newness <- stats::rbinom(n = sum(reads_u), size = 1, prob = fns)

  # Mutations
  TCs <- stats::rbinom(n = sum(reads_u), size = nUs, prob = newness*pnew + pold - newness*pold )

  # Make cB
  # read IDs continue on from the non-unique reads; seq_along() keeps this a
  # zero-length vector when no unique reads are simulated (funique = 0).
  cB_u <- dplyr::tibble(TC = TCs, nT = nUs, read_ID = nrow(cB) + seq_along(TCs),
                        GF = paste0(feature_prefix, rep(gene_vect, times = reads_u)),
                        transcripts = rep(transcript_vect, times = reads_u))


  if(funique >= 1){
    # Only unique reads were simulated
    cB <- cB_u
  }else{
    # Merge
    cB <- dplyr::bind_rows(cB, cB_u)
  }


  ### Assemble ground truth and data

  cB <- cB %>%
    dplyr::rename(feature = GF) %>%
    dplyr::group_by(feature, transcripts, TC, nT) %>%
    dplyr::count() %>%
    dplyr::mutate(sample = sample_name) %>%
    dplyr::select(sample, feature, transcripts, TC, nT, n)

  # true_count is the expected number of reads from each transcript. The ceiling
  # is applied after scaling by seqdepth (applying it to the abundance fraction
  # alone always yields 1, i.e., a count of seqdepth for every transcript).
  # The simulated total can exceed this by one read because unique and
  # non-unique read numbers are rounded up separately.
  truth <- dplyr::tibble(
    feature = paste0(feature_prefix, gene_vect),
    transcript_id = transcript_vect,
    true_kdeg = kd_t,
    true_ksyn = ks_t,
    true_fn = fn_t,
    true_count = ceiling(seqdepth*frac_abundance),
    true_TPM = (ks_t/kd_t)/(sum(ks_t/kd_t)/1000000)
  )

  return(list(cB = cB,
              ground_truth = truth))


}



#########################
###### PARAMETER CHECKS #
#########################


# Validate the arguments of `SimulateOneRep()`.
#
# `args` is a named list of argument values. Checks are run in a fixed order
# and the first violation found is reported via `stop()`. Returns `NULL`
# invisibly when every check passes.
check_SimulateOneRep_input <- function(args){

  ### nfeatures

  NF <- args$nfeatures

  if(!is.numeric(NF)){

    stop("nfeatures must be numeric!")

  }

  if(NF < 1){
    stop("nfeatures must be >= 1!")
  }

  if(round(NF) != NF){

    stop("nfeatures must be an integer!")

  }


  ### read_vect (optional)
  rv <- args$read_vect

  if(!is.null(rv)){

    if(length(rv) != 1 && length(rv) != NF){

      stop("read_vect must be either length 1 or length nfeatures!")

    }

    if(!all(is.numeric(rv))){

      stop("All elements of read_vect must be numeric!")

    }

    if(!all(rv >= 0)){

      stop("All elements of read_vect must be >= 0!")
    }

    if(!all(round(rv) == rv)){

      stop("All elements of read_vect must be integers!")

    }

  }


  ### label_time
  tl <- args$label_time

  if(!is.numeric(tl)){

    stop("label_time must be numeric")

  }

  if(tl < 0){

    stop("label_time must be >= 0!")

  }


  ### sample_name
  sname <- args$sample_name

  if(!is.character(sname)){

    stop("sample_name should be a string!")

  }

  if(length(sname) > 1){

    stop("sample_name should be a single string!")

  }

  ### feature_prefix
  fp <- args$feature_prefix

  if(!is.character(fp)){

    stop("feature_prefix should be a string!")

  }

  if(length(fp) > 1){

    stop("feature_prefix should be a single string!")

  }

  ### logkdeg_mean
  lkd <- args$logkdeg_mean

  if(!is.numeric(lkd)){

    stop("logkdeg_mean must be numeric")

  }


  ### logkdeg_sd
  lkd_sd <- args$logkdeg_sd

  if(!is.numeric(lkd_sd)){

    stop("logkdeg_sd must be numeric!")

  }

  # Zero is rejected as well, so the bound reported is strict
  if(lkd_sd <= 0){

    stop("logkdeg_sd must be > 0")

  }


  ### logksyn_mean
  lks <- args$logksyn_mean

  if(!is.numeric(lks)){

    stop("logksyn_mean must be numeric!")

  }


  ### logksyn_sd
  lks_sd <- args$logksyn_sd

  if(!is.numeric(lks_sd)){

    stop("logksyn_sd must be numeric!")

  }

  # Zero is rejected as well, so the bound reported is strict
  if(lks_sd <= 0){

    stop("logksyn_sd must be > 0")

  }


  ### seqdepth
  sdep <- args$seqdepth

  if(!is.numeric(sdep)){

    stop("seqdepth must be numeric")

  }

  if(sdep <= 0){

    stop("seqdepth must be > 0")

  }

  if(round(sdep) != sdep){

    stop("seqdepth must be an integer!")

  }


  ### pnew and pold
  pnew <- args$pnew
  pold <- args$pold

  if(!is.numeric(pnew)){

    stop("pnew must be numeric!")

  }

  if(!is.numeric(pold)){

    stop("pold must be numeric!")

  }

  if(pnew <= 0){

    stop("pnew must be > 0")

  }

  if(pold < 0){

    stop("pold must be >= 0")

  }

  if(pnew > 1){
    stop("pnew must be <= 1")
  }

  if(pold >= 1){
    stop("pold must be < 1")
  }

  if(pnew <= pold){
    stop("pnew must be strictly greater than pold!")
  }



  ### Ucont parameters
  Ucont_alpha <- args$Ucont_alpha

  if(!is.numeric(Ucont_alpha)){

    stop("Ucont_alpha must be numeric!")

  }

  if(Ucont_alpha <= 0){

    stop("Ucont_alpha must be > 0")

  }


  Ucont_beta <- args$Ucont_beta

  if(!is.numeric(Ucont_beta)){

    stop("Ucont_beta must be numeric!")

  }

  if(Ucont_beta <= 0){

    stop("Ucont_beta must be > 0")

  }

  invisible(NULL)

}



# Validate the arguments of `SimulateMultiCondition()`.
#
# `args` is a named list of argument values (including `metadf`). Checks are
# run in a fixed order and the first violation found is reported via `stop()`.
# Returns `NULL` invisibly when every check passes.
check_SimulateMultiCondition_input <- function(args){

  ### nfeatures

  NF <- args$nfeatures

  if(!is.numeric(NF)){

    stop("nfeatures must be numeric!")

  }

  if(NF < 1){
    stop("nfeatures must be >= 1!")
  }

  if(round(NF) != NF){

    stop("nfeatures must be an integer!")

  }


  ### label_time
  tl <- args$label_time

  if(!is.numeric(tl)){

    stop("label_time must be numeric")

  }

  if(tl < 0){

    stop("label_time must be >= 0!")

  }


  ### sample names (taken from the `sample` column of metadf)
  sname <- args$metadf$sample

  if(!all(is.character(sname))){

    stop("All elements of metadf column `sample` must be strings!")

  }


  ### feature_prefix
  fp <- args$feature_prefix

  if(!is.character(fp)){

    stop("feature_prefix should be a string!")

  }

  if(length(fp) > 1){

    stop("feature_prefix should be a single string!")

  }


  ### logkdeg_sd
  lkd_sd <- args$logkdeg_sd

  if(!is.numeric(lkd_sd)){

    stop("logkdeg_sd must be numeric!")

  }

  # Zero is rejected as well, so the bound reported is strict
  if(lkd_sd <= 0){

    stop("logkdeg_sd must be > 0")

  }


  ### seqdepth
  sdep <- args$seqdepth

  if(!is.numeric(sdep)){

    stop("seqdepth must be numeric")

  }

  if(sdep <= 0){

    stop("seqdepth must be > 0")

  }

  if(round(sdep) != sdep){

    stop("seqdepth must be an integer!")

  }


  ### pnew and pold
  pnew <- args$pnew
  pold <- args$pold

  if(!is.numeric(pnew)){

    stop("pnew must be numeric!")

  }

  if(!is.numeric(pold)){

    stop("pold must be numeric!")

  }

  if(pnew <= 0){

    stop("pnew must be > 0")

  }

  if(pold < 0){

    stop("pold must be >= 0")

  }

  if(pnew > 1){
    stop("pnew must be <= 1")
  }

  if(pold >= 1){
    stop("pold must be < 1")
  }

  if(pnew <= pold){
    stop("pnew must be strictly greater than pold!")
  }



  ### Ucont parameters
  Ucont_alpha <- args$Ucont_alpha

  if(!is.numeric(Ucont_alpha)){

    stop("Ucont_alpha must be numeric!")

  }

  if(Ucont_alpha <= 0){

    stop("Ucont_alpha must be > 0")

  }


  Ucont_beta <- args$Ucont_beta

  if(!is.numeric(Ucont_beta)){

    stop("Ucont_beta must be numeric!")

  }

  if(Ucont_beta <= 0){

    stop("Ucont_beta must be > 0")

  }

  invisible(NULL)

}


#' Simulation of generalized dynamical system model.
#'
#' `SimulateDynamics()` simulates any specified dynamical system of interconverting
#' RNA species. Its required input is similar to that of `EstimateDynamics()`, i.e.,
#' an adjacency matrix describing the set of species and how they are related to
#' one another and a list of formula relating actually assayed species to the
#' modeled species. Currently, `SimulateDynamics()` implements a naive heteroskedastic
#' replicate variability simulation and is not designed to simulate multiple experimental
#' conditions.
#'
#' @param nfeatures Number of "features" to simulate data for. A "feature" in this case
#' may contain a number of "sub-features". For example, you may want to simulate pre-RNA
#' and mature RNA for a set of "genes", in which case the number of features is the number
#' of genes.
#' @param graph An adjacency matrix describing the reaction diagram graph relating
#' the various RNA species to one another.
#' @param metadf Data frame with two required columns (`sample` and `tl`). `sample`
#' represents names given to each simulated sample. `tl` represents the label time
#' for that sample. Additional columns can specify other features of the sample,
#' like what subcellular compartment the sample is taken from. **NOTE: Not sure I am
#' actually using these optional columns in any useful capacity anymore**.
#' @param log_means Vector of log-Normal logmeans from which the distribution of
#' feature-specific parameters will be drawn from. Length of vector should be the same
#' as max(entries in `graph`), i.e., the number of parameters in your specified model.
#' @param log_sds Vector of log-Normal logsds from which the distribution of
#' feature-specific parameters will be drawn from.
#' @param ntreatments Number of distinct experimental treatments to simulate. By default,
#' only a single "treatment" (you might refer to this as wild-type, or control) is simulated.
#' Increase this if you would like to explore performing comparative dynamical systems
#' modeling.
#' @param treatment_effects Data frame describing effects of treatment on each
#' parameter. Should have five columns: "parameter_index", "treatment_index", "mean", "sd",
#' and "fraction_affected".
#' Each row corresponds to the effect the ith (i = treatment_index) treatment has on the
#' jth (j = parameter_index) kinetic parameter. Effect sizes, on a log-scale, are drawn from
#' a Normal distribution with mean and standard deviation set by the mean and sd columns,
#' respectively. The number of non-zero effects is set by "fraction_affected", and is
#' equal to `ceiling(nfeatures * fraction_affected)`. treatment_index of 1 will be ignored
#' and can either be included or not.
#' @param formula_list A list of named lists. The names of each sub-list should be
#' the same as the sample names as they are found in `metadf`. Each sub-list should be
#' a list of formula relating feature names that will show up as columns of the simulated
#' cB to species modeled in your `graph`. This only needs to be specified if you want
#' to simulate the scenario where some of the measured species are a sum of modeled species.
#' @param unassigned_name String to give to reads not assigned to a given feature.
#' @param seqdepth Total number of reads in each sample.
#' @param dispersion Negative binomial `size` parameter to use for simulating read counts
#' @param lfn_sd Logit(fn) replicate variability.
#' @param effect_avg_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `mean` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @param effect_sd_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `sd` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @param fraction_affected_default If `ntreatments` > 1, and `treatment_effects` is not
#' provided, this will be the value of `fraction_affected` for all treatments and parameters imputed
#' in `treatment_effects`.
#' @param ... Parameters passed to `SimulateOneRep()`.
#' @importFrom magrittr %>%
SimulateDynamics <- function(nfeatures, graph, metadf,
                             log_means, log_sds,
                             ntreatments = 1,
                             treatment_effects = NULL,
                             formula_list = NULL,
                             unassigned_name = "__no_feature",
                             seqdepth = nfeatures * 2500,
                             dispersion = 100, lfn_sd = 0.2,
                             effect_avg_default = 0,
                             effect_sd_default = 0.5,
                             fraction_affected_default = 0.5,
                             ...){


  # Hack to deal with devtools::check() NOTEs
  ss <- feature_type <- feature <- NULL
  parameter_index <- treatment_index <- sd <- fraction_affected <- NULL
  treatment <- NULL

  ### Step 0, generate parameters for each feature
  # param_per_treatment[[trt]][[k]] is a length-nfeatures vector holding the
  # value of parameter k for every feature under treatment trt.

  param_per_treatment <- vector(mode = "list",
                                length = ntreatments)

  # Infer treatment_effects if not present and needed.
  # `&&` because this is a scalar condition.
  if(is.null(treatment_effects) && ntreatments > 1){
    treatment_effects <- automatic_treatment_effects(nt = ntreatments,
                                                     np = max(graph),
                                                     default_mean = effect_avg_default,
                                                     default_sd = effect_sd_default,
                                                     default_affected = fraction_affected_default)
  }

  # NB: the loop variable is deliberately not called `t`, so that it does not
  # shadow base::t(), which is used further down.
  for(trt in 1:ntreatments){

    param_list <- vector(mode = "list", length = length(log_means))

    for(lms in seq_along(log_means)){

      if(trt == 1){

        # Reference treatment: log-normally distributed parameter values
        param_list[[lms]] <- stats::rlnorm(nfeatures,
                                           meanlog = log_means[lms],
                                           sdlog = log_sds[lms])

      }else{

        # Row of treatment_effects describing this parameter/treatment pair
        effect_details <- treatment_effects %>%
          dplyr::filter(parameter_index == lms &
                          treatment_index == trt)

        avg_effect <- effect_details[["mean"]]
        effect_size <- effect_details[["sd"]]

        num_affected <- ceiling(
          effect_details[["fraction_affected"]] * nfeatures
        )

        num_null <- nfeatures - num_affected

        # Effects are additive on the log scale relative to treatment 1.
        # The first num_null features get no effect; the remaining
        # num_affected features get a normally distributed log-fold change.
        param_list[[lms]] <- exp(
          log(param_per_treatment[[1]][[lms]]) +
          c(
            rep(0, times = num_null),
            stats::rnorm(num_affected,
                       mean = avg_effect,
                       sd = effect_size)
            )
        )

      }


    }

    param_per_treatment[[trt]] <- param_list


  }



  if(!("treatment" %in% colnames(metadf))){

    if(ntreatments == 1){
      metadf[["treatment"]] <- "treatment1"
    }else{
      stop("Simulating multiple treatments but you have not specified which samples
         belong to which treatments! Do so by adding a column named 'treatment'
         to your metadf.")
    }

  }


  # The trt-th unique value of metadf$treatment is paired with the trt-th
  # simulated parameter set.
  treatment_vect <- unique(metadf$treatment)

  # Without this check a missing treatment only surfaces later as an obscure
  # join error on an empty sample table.
  if(length(treatment_vect) < ntreatments){
    stop("The 'treatment' column of metadf contains fewer unique values than ntreatments!")
  }


  # Position of the "0" node, and the names of the modeled species (every
  # node except "0"). Both are constant across features and treatments.
  zero_index <- which(colnames(graph) == "0")
  species <- rownames(graph)[-zero_index]


  # Loop over each treatment and feature. One tibble is collected per
  # (treatment, feature) pair and all of them are bound together once at the
  # end, rather than re-copying a growing table on every iteration.
  sim_pieces <- vector(mode = "list", length = ntreatments * nfeatures)

  for(trt in 1:ntreatments){

    treatment_value <- treatment_vect[trt]
    submeta <- metadf %>%
      dplyr::filter(treatment == treatment_value)

    # Sample-level metadata (everything but the label time) to attach to the
    # simulated values
    sample_details <- submeta %>%
      dplyr::select(-tl)

    for(i in 1:nfeatures){

      ### Step 1, construct A
      # Entry 0 of graph maps to a rate of 0; entry k maps to the i-th
      # feature's value of parameter k.
      param_extend <- c(0, vapply(param_per_treatment[[trt]],
                                  function(x) x[i],
                                  numeric(1)))
      param_graph <- matrix(param_extend[graph + 1],
                            nrow = nrow(graph),
                            ncol = ncol(graph),
                            byrow = FALSE)

      A <- matrix(0,
                  nrow = nrow(graph) - 1,
                  ncol = ncol(graph) - 1)

      rownames(A) <- species
      colnames(A) <- species

      # Diagonal: minus the summed rates in each species' row of param_graph.
      # Off-diagonal: transposed species-to-species rates.
      # drop = FALSE keeps these as matrices when there is a single species.
      diag(A) <- -rowSums(param_graph[-zero_index, , drop = FALSE])
      A <- A + t(param_graph[-zero_index, -zero_index, drop = FALSE])


      ### Step 2: infer general solution
      # Steady state solves A %*% Rss = -(rates in the "0" row)

      Rss <- solve(a = A,
                   b = -param_graph[zero_index,-zero_index])
      names(Rss) <- species


      ev <- eigen(A)

      lambda <- ev$values
      V <- ev$vectors

      # Coefficients chosen so that the solution is 0 at tl = 0
      # (V %*% cs = -Rss). Independent of the sample, so computed once.
      cs <- solve(V, -Rss)


      ### Step 3: Infer data for actual measured species
      all_ss <- c()
      all_fns <- c()
      features <- c()
      sample_names <- c()
      for(s in seq_along(submeta$sample)){

        tl <- submeta$tl[s]
        sample <- submeta$sample[s]

        exp_lambda <- exp(lambda*tl)

        # nrow is given explicitly because diag() of a length-one vector
        # would otherwise build an identity matrix instead of a 1x1 diagonal.
        mode_weights <- exp_lambda*cs
        scaled_eigenvectors <- V %*% diag(mode_weights,
                                          nrow = length(mode_weights))

        # Level of each species at time tl, starting from 0 at tl = 0
        result_vector <- rowSums(scaled_eigenvectors) + Rss

        names(result_vector) <- species

        # Evaluate the formulas
        # NOTE(review): if formula_list is NULL (the default), sample_formula
        # is NULL and evaluate_formulas() returns an empty vector, so nothing
        # is simulated for the sample -- confirm the intended default.
        sample_formula <- formula_list[[sample]]

        measured_levels <- evaluate_formulas(result_vector, sample_formula)
        measured_ss <- evaluate_formulas(Rss, sample_formula)


        # fn = level at time tl relative to the steady-state level
        all_fns <- c(all_fns, measured_levels/measured_ss)
        all_ss <- c(all_ss, measured_ss)
        features <- c(features, names(measured_levels))
        sample_names <- c(sample_names, rep(sample, times = length(measured_levels)))

      }


      sim_pieces[[(trt - 1) * nfeatures + i]] <-
        dplyr::tibble(sample = sample_names,
                      fn = all_fns,
                      ss = all_ss,
                      feature_type = features,
                      feature = paste0("Gene", i)) %>%
        dplyr::inner_join(sample_details,
                          by = "sample")


    }


  }

  sim_df <- dplyr::bind_rows(sim_pieces)


  ### Simulate one replicate of each feature and sample
  combined_cB <- dplyr::tibble()
  combined_gt <- dplyr::tibble()
  for(s in seq_along(metadf$sample)){


    sample_name <- metadf$sample[s]

    # Expected read count is proportional to the steady-state level
    sample_sim <- sim_df %>%
      dplyr::filter(sample == sample_name) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(avg_reads = seqdepth*ss/sum(ss))

    # Add some read variance
    sample_sim$reads <- stats::rnbinom(n = nrow(sample_sim),
                                mu = sample_sim$avg_reads,
                                size = dispersion)

    # Add some fn variance
    sample_sim$fn_rep <- inv_logit(
      stats::rnorm(n = nrow(sample_sim),
            mean = logit(sample_sim$fn),
            sd = lfn_sd)
    )

    types <- unique(sample_sim$feature_type)
    num_types <- length(types)
    NF <- length(unique(sample_sim$feature))

    # One SimulateOneRep() call per measured feature type; its `feature`
    # column is renamed to the feature type so that the types end up as
    # separate columns once the results are row-bound.
    cB <- dplyr::tibble()
    gt <- dplyr::tibble()
    for(n in 1:num_types){

      ft <- types[n]

      feature_sim <- sample_sim %>%
        dplyr::filter(feature_type == ft)


      simdata <- SimulateOneRep(nfeatures = NF,
                                read_vect = feature_sim$reads,
                                fn_vect = feature_sim$fn_rep,
                                sample_name = sample_name,
                                ...)

      cB <- dplyr::bind_rows(cB, simdata$cB %>%
                        dplyr::rename(!!ft := feature))

      gt <- dplyr::bind_rows(gt, simdata$ground_truth %>%
                        dplyr::rename(!!ft := feature))

    }

    # Rows from one feature type are NA in the other types' columns after
    # binding; label those as unassigned.
    cB <- cB %>%
      dplyr::mutate(dplyr::across(dplyr::all_of(types), ~tidyr::replace_na(.x, unassigned_name)))

    gt <- gt %>%
      dplyr::mutate(dplyr::across(dplyr::all_of(types), ~tidyr::replace_na(.x, unassigned_name)))


    combined_cB <- dplyr::bind_rows(combined_cB, cB)
    combined_gt <- dplyr::bind_rows(combined_gt, gt)

  }

  # Get parameter truths (one row per feature and treatment; later treatments
  # are placed first)
  parameter_truth <- dplyr::tibble()
  for(trt in 1:ntreatments){
    param_list <- param_per_treatment[[trt]]
    names(param_list) <- paste0('true_k', seq_along(log_means))
    parameter_truth_t <- dplyr::as_tibble(param_list)
    parameter_truth_t$feature <- paste0('Gene', seq_len(nrow(parameter_truth_t)))
    parameter_truth_t$treatment <- treatment_vect[trt]

    parameter_truth <- parameter_truth_t %>%
      dplyr::bind_rows(parameter_truth)

  }


  # Form final ground truth
  gt_list <- list(replicate_truth = combined_gt,
                  avgfn_truth = sim_df,
                  parameter_truth = parameter_truth)

  # The assignment is the function's last expression, so the list is
  # returned invisibly.
  output <- list(cB = combined_cB,
                 ground_truth = gt_list)

}


####################
# HELPER FUNCTIONS #
####################


# Function for relating modeled species to measured species
#
# original_vector: named numeric vector; the names are the modeled species.
# formulas: list of two-sided formulas (e.g. `GF ~ C + N`). The left-hand side
#   names the measured feature; the right-hand side is an expression written
#   in terms of the modeled species.
#
# Returns a named numeric vector with one element per formula, named after the
# formula's left-hand side. An empty or NULL `formulas` gives `numeric()`.
evaluate_formulas <- function(original_vector, formulas) {

  # Species values, looked up by name when a right-hand side is evaluated.
  # Built once since it does not depend on the formula.
  species_values <- as.list(original_vector)

  new_vector <- numeric()

  for (formula in formulas) {
    response <- as.character(formula[[2]])

    # Anything not found among the species is looked up in the formula's own
    # environment (operators, constants), never in this function's frame, so
    # a misspelled species cannot silently resolve to one of the local
    # variables used here.
    new_vector[response] <- eval(formula[[3]],
                                 envir = species_values,
                                 enclos = environment(formula))
  }

  return(new_vector)

}

# Generate N labels following the pattern "A", ..., "Z", "AA", ..., "ZZ",
# "AAA", ...: each pass through the alphabet repeats the letter one more time.
#
# N: number of labels wanted.
#
# Returns an unnamed character vector of length N (character(0) when N is 0).
generate_pattern <- function(N) {

  # Number of passes through the alphabet needed to reach N labels
  n_rounds <- ceiling(N / length(LETTERS))

  labels <- unlist(
    lapply(seq_len(n_rounds), function(k) strrep(LETTERS, k)),
    use.names = FALSE
  )

  # as.character() turns the NULL produced when N is 0 into character(0);
  # seq_len() avoids the 1:0 pitfall when subsetting to the first N elements.
  as.character(labels)[seq_len(N)]
}



# Build the list of pre-defined ODE models. Each model is a list with
#   graph:    square matrix whose rows and columns are labelled "0" followed
#             by the model's species; entries are parameter indices, with 0
#             where no parameter is assigned.
#   formulas: named list; each element is a list of formulas relating
#             measured features (GF, XF) to the model's species.
create_odemodels_internally <- function(){

  # Square matrix filled row by row, with rows and columns both labelled
  # by `node_names`.
  make_graph <- function(entries, node_names){
    matrix(entries,
           nrow = length(node_names),
           ncol = length(node_names),
           byrow = TRUE,
           dimnames = list(node_names, node_names))
  }

  list(

    ## nuc2cyto
    nuc2cyto = list(
      graph = make_graph(c(0, 1, 0,
                           0, 0, 2,
                           3, 0, 0),
                         c("0", "N", "C")),
      formulas = list(
        total = list(GF ~ C + N),
        nuclear = list(GF ~ N),
        cytoplasm = list(GF ~ C)
      )
    ),

    ## preRNA
    preRNA = list(
      graph = make_graph(c(0, 1, 0,
                           0, 0, 2,
                           3, 0, 0),
                         c("0", "P", "M")),
      formulas = list(
        total = list(GF ~ P,
                     XF ~ M)
      )
    ),

    ## preRNAwithPdeg
    preRNAwithPdeg = list(
      graph = make_graph(c(0, 1, 0,
                           3, 0, 2,
                           4, 0, 0),
                         c("0", "P", "M")),
      formulas = list(
        total = list(GF ~ P,
                     XF ~ M)
      )
    ),

    ## nuc2cytowithNdeg
    nuc2cytowithNdeg = list(
      graph = make_graph(c(0, 1, 0,
                           3, 0, 2,
                           4, 0, 0),
                         c("0", "N", "C")),
      formulas = list(
        total = list(GF ~ C + N),
        nuclear = list(GF ~ N),
        cytoplasm = list(GF ~ C)
      )
    ),

    ## subtlseq
    subtlseq = list(
      graph = make_graph(c(0, 1, 0, 0, 0,
                           0, 0, 2, 0, 0,
                           0, 0, 0, 3, 0,
                           5, 0, 0, 0, 4,
                           5, 0, 0, 0, 0),
                         c("0", "CH", "NP", "CY", "PL")),
      formulas = list(
        total = list(GF ~ CH + NP + CY + PL),
        chromatin = list(GF ~ CH),
        nuclear = list(GF ~ NP + CH),
        cytoplasm = list(GF ~ CY + PL),
        polysome = list(GF ~ PL)
      )
    ),

    ## nuc2cytowithpreRNA
    nuc2cytowithpreRNA = list(
      graph = make_graph(c(0, 1, 0, 0, 0,
                           0, 0, 2, 3, 0,
                           0, 0, 0, 0, 4,
                           0, 0, 0, 0, 5,
                           6, 0, 0, 0, 0),
                         c("0", "NP", "NM", "CP", "CM")),
      formulas = list(
        total = list(GF ~ NP + CP,
                     XF ~ NM + CM),
        nuclear = list(GF ~ NP,
                       XF ~ NM),
        cytoplasm = list(GF ~ CP,
                         XF ~ CM)
      )
    )

  )

}


# Determine treatment_effects df if not provided
#
# nt: number of treatments.
# np: number of parameters.
# default_mean, default_sd, default_affected: values used to fill the `mean`,
#   `sd`, and `fraction_affected` columns of every row.
#
# Returns a tibble with one row per (treatment, parameter) combination and
# columns parameter_index, treatment_index, mean, sd, fraction_affected.
# Rows are ordered by treatment, then by parameter within treatment.
automatic_treatment_effects <- function(nt, np,
                                        default_mean,
                                        default_sd,
                                        default_affected){

  # seq_len() rather than 1:n so that a count of 0 gives no rows instead of
  # the indices c(1, 0).
  treatment_ids <- rep(seq_len(nt), each = np)
  parameter_ids <- rep(seq_len(np), times = nt)

  treatment_effects <- dplyr::tibble(
    parameter_index = parameter_ids,
    treatment_index = treatment_ids,
    mean = default_mean,
    sd = default_sd,
    fraction_affected = default_affected
  )

  return(treatment_effects)

}

