% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/biglasso.R
\name{big_spLinReg}
\alias{big_spLinReg}
\title{Sparse linear regression}
\usage{
big_spLinReg(X, y.train, ind.train = rows_along(X), ind.col = cols_along(X),
  covar.train = NULL, ncores = 1, ...)
}
\arguments{
\item{X}{A \link[=FBM-class]{FBM}.}

\item{y.train}{Vector of responses, corresponding to \code{ind.train}.}

\item{ind.train}{An optional vector of the row indices that are used
for the training part. If not specified, all rows are used.
\strong{Don't use negative indices.}}

\item{ind.col}{An optional vector of the column indices that are used.
If not specified, all columns are used. \strong{Don't use negative indices.}}

\item{covar.train}{Matrix of covariables to be added in each model to correct
for confounders (e.g. the scores of PCA), corresponding to \code{ind.train}.
Default is \code{NULL} and corresponds to only adding an intercept to each model.}

\item{ncores}{Number of cores used. Default doesn't use parallelism.
You may use \link{nb_cores}.}

\item{...}{Arguments passed on to \code{COPY_biglasso_main}
\describe{
  \item{alpha}{The elastic-net mixing parameter that controls the relative
contribution from the lasso (l1) and the ridge (l2) penalty. The penalty is
defined as \deqn{ \alpha||\beta||_1 + (1-\alpha)/2||\beta||_2^2.}
\code{alpha = 1} is the lasso penalty and \code{alpha} in between \code{0}
(\code{1e-6}) and \code{1} is the elastic-net penalty. Default is \code{0.5}.}
  \item{lambda.min}{The smallest value for lambda, \strong{as a fraction of
lambda.max}. Default is \code{.0001} if the number of observations is larger than
the number of variables and \code{.001} otherwise.}
  \item{nlambda}{The number of lambda values. Default is \code{200}.}
  \item{eps}{Convergence threshold for inner coordinate descent.
The algorithm iterates until the maximum change in the objective after any
coefficient update is less than \code{eps} times the null deviance.
Default value is \code{1e-7}.}
  \item{max.iter}{Maximum number of iterations. Default is \code{1000}.}
  \item{dfmax}{Upper bound for the number of nonzero coefficients. Default is
\code{20e3} because, for large data sets, computational burden may be
heavy for models with a large number of nonzero coefficients.}
  \item{warn}{Return warning messages for failures to converge and model
saturation? Default is \code{FALSE}.}
  \item{K}{Number of sets used in the Cross-Model Selection and Averaging
(CMSA) procedure. Default is \code{10}.}
  \item{ind.sets}{Integer vectors of values between \code{1} and \code{K} specifying
which set each index of the training set is in. Default randomly assigns
these values.}
  \item{return.all}{Whether to return coefficients for all lambda values.
Default is \code{FALSE} and returns only coefficients which maximize prediction
on the corresponding validation set.}
  \item{nlam.min}{Minimum number of lambda values to investigate. Default is \code{50}.}
  \item{n.abort}{Number of lambda values for which prediction on the validation
set must decrease before stopping. Default is \code{10}.}
}}
}
\value{
Return an object of class \code{big_sp_best_list} (a list of K elements),
which has a method \code{predict} that can compute K vectors of predictions,
which could be combined with e.g. \code{rowMeans}. See details.
}
\description{
Fit lasso penalized linear regression path for a Filebacked Big Matrix.
Covariates can be added to correct for confounders.
}
\details{
\strong{This is a modified version of one function of
\href{https://github.com/YaohuiZeng/biglasso}{package biglasso}}.
It adds the possibility to train models with covariables and use many
types of \code{FBM} (not only \code{double} ones).
Yet, it only corresponds to \code{screen = "SSR"} (Sequential Strong Rules).

Also, to remove the choice of the lambda parameter, we introduce the
Cross-Model Selection and Averaging (CMSA) procedure:
\enumerate{
\item This function separates the training set into \code{K} folds (e.g. 10).
\item \strong{In turn},
\itemize{
\item each fold is considered as an inner validation set and the other
(K - 1) folds form an inner training set,
\item the model is trained on the inner training set and the corresponding
predictions (scores) for the inner validation set are computed,
\item the vector of scores which maximizes log-likelihood is determined,
\item the vector of coefficients corresponding to the previous vector of
scores is chosen.
}
\item The \code{K} resulting vectors of coefficients can then be combined into one
vector (see \link{get_beta}) or you can just combine the predictions
(e.g. using \code{predict} followed by \code{rowMeans}).
}
}
\examples{
set.seed(1)

# Simulate some data: N observations of M variables, stored in an FBM.
N <- 230
M <- 730
X <- FBM(N, M, init = rnorm(N * M, sd = 5), type = "integer")
# The response has to be a vector with one value per observation:
# sum the first five columns and add some Gaussian noise.
# (Without rowSums(), `y` would be an N x 5 matrix and `y[ind.train]`
# would silently pick values from its first column only.)
y <- rowSums(X[, 1:5]) + rnorm(N)
# Three additional covariables, one row per observation
covar <- matrix(rnorm(N * 3), N)

# Split the rows into a training set (150 rows) and a test set (the others)
ind.train <- sort(sample(nrow(X), 150))
ind.test <- setdiff(rows_along(X), ind.train)

# Fit the models using the training rows only
test <- big_spLinReg(X, y[ind.train], ind.train = ind.train,
                     covar.train = covar[ind.train, ],
                     warn = FALSE)
# K = 10 predictions
str(preds <- predict(test, X, ind.row = ind.test, covar.row = covar[ind.test, ]))
# Combine them
preds2 <- rowMeans(preds)

# Compare the combined predictions with the true responses of the test set
plot(preds2, y[ind.test], pch = 20); abline(0, 1, col = "red")
}
\references{
Tibshirani, R., Bien, J., Friedman, J., Hastie, T.,
Simon, N., Taylor, J. and Tibshirani, R. J. (2012),
Strong rules for discarding predictors in lasso-type problems.
Journal of the Royal Statistical Society:
Series B (Statistical Methodology), 74: 245--266.
\url{https://doi.org/10.1111/j.1467-9868.2011.01004.x}.

Zeng, Y., and Breheny, P. (2017). The biglasso Package: A Memory- and
Computation-Efficient Solver for Lasso Model Fitting with Big Data in R.
arXiv preprint arXiv:1701.05936. \url{https://arxiv.org/abs/1701.05936}.
}
\seealso{
\link[glmnet:glmnet]{glmnet} \link[biglasso:biglasso]{biglasso}
}
