% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/bigpca.R
\name{subcor.select}
\alias{subcor.select}
\title{Selection of the most correlated variable subset}
\usage{
subcor.select(bigMat, keep = 0.05, rows = TRUE, hi.cor = TRUE,
  dir = getwd(), random = TRUE, ram.gb = 0.1)
}
\arguments{
\item{bigMat}{a big.matrix, matrix or any object accepted by get.big.matrix()}

\item{keep}{numeric, by default a proportion (decimal) of the original number of rows/columns to choose
for the subset. Otherwise, if an integer>2, this is assumed to be the size of the desired subset,
e.g., for a dataset with 10,000 rows where you want a subset size of 1,000 you could set 'keep' as
either 0.1 or 1000.}

\item{rows}{logical, whether the subset should be of the rows of bigMat. If rows=FALSE, then
the subset is chosen from columns, which is equivalent to calling subcor.select(t(bigMat)),
but avoids actually performing the transpose, which can save time for large matrices.}

\item{hi.cor}{logical, whether to choose the most correlated (TRUE) or least correlated subset (FALSE).}

\item{dir}{the directory containing the bigMat backing file (e.g., parameter for get.big.matrix()).}

\item{random}{logical, passed to uniform.select(), whether to take a random or uniform selection
of columns (or rows if rows=FALSE) on which to run the subset correlation.}

\item{ram.gb}{maximum size of the matrix in gigabytes for the subset correlation, 0.1GB is the default
which should result in minimal processing time on a typical system. Increasing this
increases the processing time, but also the representativeness of the subset chosen. Note
that some very large matrices will not be able to be processed by this function unless
this parameter is increased; basically if the dimension being thinned is more than 5\% of
this memory limit (see estimate.memory() from NCmisc).}
}
\value{
A set of row or column indexes (depends on 'rows' parameter) of the most inter-correlated
 (or least) variables in the matrix.
}
\description{
Returns a subset (size='keep') of row or column numbers that are most correlated to other
variables in the dataset (or, if hi.cor=FALSE, those that are least correlated).
This function performs cor() on a small subset of columns and all rows (when rows=TRUE, or vice
versa when rows=FALSE), and selects the rows (when rows=TRUE) with the greatest/least absolute sum of correlations.
}
\examples{
mat <- matrix(rnorm(200*2000),ncol=200)
bmat <- as.big.matrix(mat)
ii1 <- subcor.select(bmat,.05,rows=TRUE) # thin down to 5\% of the rows
ii2 <- subcor.select(bmat,45,rows=FALSE) # thin down to 45 columns
prv(ii1,ii2)
# show that rows=T is equivalent to rows=F of the transpose (random must be FALSE)
ii1 <- subcor.select(mat,.4,rows=TRUE,random=FALSE)
ii2 <- subcor.select(t(mat),.4,rows=FALSE,random=FALSE)
print(all.equal(ii1,ii2))
}
\author{
Nicholas Cooper
}
\seealso{
\code{\link{thin}}, \code{\link{uniform.select}}, \code{\link{get.big.matrix}}
}

