% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/simulations.R
\name{SimulateRegression}
\alias{SimulateRegression}
\title{Data simulation for multivariate regression}
\usage{
SimulateRegression(
  n = 100,
  pk = 10,
  N = 3,
  family = "gaussian",
  ev_xz = 0.8,
  adjacency_x = NULL,
  nu_within = 0.1,
  theta_xz = NULL,
  nu_xz = 0.2,
  theta_zy = NULL,
  nu_zy = 0.5,
  eta = NULL,
  eta_set = c(-1, 1),
  v_within = c(0.5, 1),
  v_sign = c(-1, 1),
  continuous = TRUE,
  pd_strategy = "diagonally_dominant",
  ev_xx = NULL,
  scale_ev = TRUE,
  u_list = c(1e-10, 1),
  tol = .Machine$double.eps^0.25
)
}
\arguments{
\item{n}{number of observations in the simulated data.}

\item{pk}{vector with the number of predictors in each independent block of
variables in \code{xdata}. The number of independent blocks, which
determines the maximum number of orthogonal latent variables that can be
simulated, is given by \code{length(pk)}.}

\item{N}{number of classes of the categorical outcome. Only used if
\code{family="multinomial"}.}

\item{family}{type of outcome. If \code{family="gaussian"}, normally
distributed outcomes are simulated. If \code{family="binomial"} or
\code{family="multinomial"}, binary outcome(s) are simulated from a
multinomial distribution where the probability is defined from a linear
combination of normally distributed outcomes.}

\item{ev_xz}{vector of the expected proportions of explained variances for
each of the orthogonal latent variables. It must contain values in ]0,1[,
and must be a vector of length \code{length(pk)} or a single value to
generate latent variables with the same expected proportion of explained
variance.}

\item{adjacency_x}{optional matrix encoding the conditional independence
structure between predictor variables in \code{xdata}. This argument must
be a binary symmetric matrix of size \code{sum(pk)} with zeros on the
diagonal.}

\item{nu_within}{expected density (number of edges over the number of node
pairs) of the conditional independence graph in the within-group blocks for
predictors. For independent predictors, use \code{nu_within=0}. This
argument is only used if \code{adjacency_x} is not provided.}

\item{theta_xz}{optional binary matrix encoding the predictor variables from
\code{xdata} (columns) contributing to the definition of the orthogonal
latent outcomes from \code{zdata} (rows).}

\item{nu_xz}{expected proportion of relevant predictors over the total number
of predictors to be used for the simulation of the orthogonal latent
outcomes. This argument is only used if \code{theta_xz} is not provided.}

\item{theta_zy}{optional binary matrix encoding the latent variables from
\code{zdata} (columns) contributing to the definition of the observed
outcomes from \code{ydata} (rows). This argument must be a square matrix of
size \code{length(pk)}. If \code{theta_zy} is a diagonal matrix, each
latent variable contributes to the definition of one observed outcome so
that there is a one-to-one relationship between latent and observed
outcomes (i.e. they are collinear). Nonzero off-diagonal elements in
\code{theta_zy} introduce some correlation between the observed outcomes by
construction from linear combinations implicating common latent outcomes.
This argument is only used if \code{eta} is not provided.}

\item{nu_zy}{probability for each of the off-diagonal elements in
\code{theta_zy} to be a 1. If \code{nu_zy=0}, \code{theta_zy} is a diagonal
matrix. This argument is only used if \code{theta_zy} is not provided.}

\item{eta}{optional matrix of coefficients used in the linear combination of
latent outcomes to generate observed outcomes.}

\item{eta_set}{vector defining the range of values from which \code{eta} is
sampled. This argument is only used if \code{eta} is not provided.}

\item{v_within}{vector defining the (range of) nonzero entries in the
diagonal blocks of the precision matrix. These values must be between -1
and 1 if \code{pd_strategy="min_eigenvalue"}. If \code{continuous=FALSE},
\code{v_within} is the set of possible precision values. If
\code{continuous=TRUE}, \code{v_within} is the range of possible precision
values.}

\item{v_sign}{vector of possible signs for precision matrix entries. Possible
inputs are: \code{-1} for positive partial correlations, \code{1} for
negative partial correlations, or \code{c(-1, 1)} for both positive and
negative partial correlations.}

\item{continuous}{logical indicating whether to sample precision values from
a uniform distribution between the minimum and maximum values in
\code{v_within} (\code{continuous=TRUE}) or from the proposed values in
\code{v_within} (\code{continuous=FALSE}).}

\item{pd_strategy}{method to ensure that the generated precision matrix is
positive definite (and hence can be a covariance matrix). If
\code{pd_strategy="diagonally_dominant"}, the precision matrix is made
diagonally dominant by setting the diagonal entries to the sum of absolute
values on the corresponding row and a constant u. If
\code{pd_strategy="min_eigenvalue"}, diagonal entries are set to the sum of
the absolute value of the smallest eigenvalue of the precision matrix with
zeros on the diagonal and a constant u.}

\item{ev_xx}{expected proportion of explained variance by the first Principal
Component (PC1) of a Principal Component Analysis. This is the largest
eigenvalue of the correlation (if \code{scale_ev=TRUE}) or covariance (if
\code{scale_ev=FALSE}) matrix divided by the sum of eigenvalues. If
\code{ev_xx=NULL} (the default), the constant u is chosen by maximising the
contrast of the correlation matrix.}

\item{scale_ev}{logical indicating if the proportion of explained variance by
PC1 should be computed from the correlation (\code{scale_ev=TRUE}) or
covariance (\code{scale_ev=FALSE}) matrix. If \code{scale_ev=TRUE}, the
correlation matrix is used as parameter of the multivariate normal
distribution.}

\item{u_list}{vector with two numeric values defining the range of values to
explore for constant u.}

\item{tol}{accuracy for the search of parameter u as defined in
\code{\link[stats]{optimise}}.}
}
\value{
A list with: \item{xdata}{simulated predictor data.}
  \item{ydata}{simulated outcome data.} \item{proba}{simulated probability of
  belonging to each outcome class. Only used for \code{family="binomial"} or
  \code{family="multinomial"}.} \item{logit_proba}{logit of the simulated
  probability of belonging to each outcome class. Only used for
  \code{family="binomial"} or \code{family="multinomial"}.}
  \item{zdata}{simulated data for orthogonal latent outcomes.}
  \item{beta}{matrix of true beta coefficients used to generate outcomes in
  \code{ydata} from predictors in \code{xdata}.} \item{theta}{binary matrix
  indicating the predictors from \code{xdata} contributing to the definition
  of each of the outcome variables in \code{ydata}.} \item{eta}{matrix of
  coefficients used in the linear combination of latent variables from
  \code{zdata} to define observed outcomes in \code{ydata}.}
  \item{theta_zy}{binary matrix indicating the latent variables from
  \code{zdata} used in the definition of observed outcomes in \code{ydata}.}
  \item{xi}{matrix of true beta coefficients used to generate orthogonal
  latent outcomes in \code{zdata} from predictors in \code{xdata}.}
  \item{theta_xz}{binary matrix indicating the predictors from \code{xdata}
  contributing to the definition of each of the latent outcome variables in
  \code{zdata}.} \item{omega_xz}{precision matrix for variables in
  \code{xdata} and \code{zdata}.} \item{adjacency}{binary matrix encoding the
  conditional independence structure between variables from \code{xdata}
  (\code{var}), \code{zdata} (\code{latent}) and \code{ydata}
  (\code{outcome}).}
}
\description{
Simulates data with outcome(s) and predictors, where only a subset of the
predictors actually contributes to the definition of the outcome(s).
}
\details{
For a univariate outcome (\code{length(pk)=1}), the simulation is
  done in four steps where (i) predictors contributing to outcome definition
  are randomly sampled (with probability \code{nu_xz} for a given predictor
  to be picked), (ii) the conditional independence structure between the
  predictors is simulated (with probability \code{nu_within} for a given pair
  of predictors to be correlated, conditionally on all other variables),
  (iii) a precision matrix (inverse covariance matrix) is generated for all
  variables, where nonzero entries correspond to the predictors contributing
  to outcome definition or conditional correlation between the predictors,
  and (iv) data for both predictors and outcome is simulated from a single
  multivariate Normal distribution using the inverse precision matrix as
  covariance matrix.

  To ensure that the generated precision matrix \eqn{\Omega} is positive
  definite, the diagonal entries are defined as described in
  \code{\link{MakePositiveDefinite}}. The conditional variance of the outcome
  \eqn{\Omega_{YY}} is chosen so that the proportion of variance in the
  outcome that is explained by the predictors is \code{ev_xz}.

  For a multivariate outcome (\code{length(pk)>1}), we introduce independent
  groups of predictors and orthogonal latent variables (groups are defined in
  \code{pk}). Each latent variable is defined as a function of variables
  belonging to one group of predictors. The precision matrix is defined as
  described above for univariate outcomes. Subject to the re-ordering of its
  rows and columns, this precision matrix is block-diagonal, encoding the independence
  between sets of variables made of (i) the groups of predictors, and (ii)
  their corresponding latent variable. The outcome variables are then
  constructed from a linear combination of the latent variables, allowing for
  contributing predictors belonging to different groups.

  The use of latent variables in the multivariate case ensures that we can
  control the proportion of variance in the latent variable explained by the
  predictors (\code{ev_xz}).
}
\examples{
# Save the current graphical parameters so that they can be restored at the
# end of the examples, then set the plot margins used by the figures below
oldpar <- par(no.readonly = TRUE)
par(mar = c(5, 5, 5, 5))

## Continuous outcomes

# Univariate outcome: a single block of 15 predictors (length(pk) = 1)
set.seed(1)
simul <- SimulateRegression(pk = 15)
print(simul)
plot(simul)

# Multivariate outcome: three independent blocks of 5, 7 and 3 predictors
# (length(pk) > 1)
set.seed(1)
simul <- SimulateRegression(pk = c(5, 7, 3))
print(simul)
plot(simul)

# Independent predictors: nu_within = 0 gives no edges in the within-block
# conditional independence graph
set.seed(1)
simul <- SimulateRegression(pk = c(5, 3), nu_within = 0)
print(simul)
plot(simul)

# Blocks of strongly inter-connected predictors: denser within-block graph
# (nu_within = 0.5) with positive partial correlations only (v_sign = -1)
set.seed(1)
simul <- SimulateRegression(
  pk = c(5, 5), nu_within = 0.5,
  v_within = c(0.5, 1), v_sign = -1,
  continuous = TRUE, pd_strategy = "min_eigenvalue"
)
print(simul)
# Pairwise correlations between the simulated predictors
Heatmap(
  mat = cor(simul$xdata),
  col = c("navy", "white", "red"),
  legend_range = c(-1, 1)
)
plot(simul)


## Categorical outcomes

# Binary outcome: tabulate the values in the first column of ydata
set.seed(1)
simul <- SimulateRegression(pk = 20, family = "binomial")
print(simul)
table(simul$ydata[, 1])

# Categorical outcome: column totals of ydata
# (presumably one indicator column per outcome class - to be confirmed)
set.seed(1)
simul <- SimulateRegression(pk = 20, family = "multinomial")
print(simul)
colSums(simul$ydata)

# Restore the graphical parameters saved at the start
par(oldpar)
}
\references{
\insertRef{ourstabilityselection}{fake}
}
\seealso{
Other simulation functions: 
\code{\link{SimulateAdjacency}()},
\code{\link{SimulateComponents}()},
\code{\link{SimulateGraphical}()}
}
\concept{simulation functions}
