% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/disteg.R
\name{disteg}
\alias{disteg}
\title{Calculate distance between genotypes and gene expression data}
\usage{
disteg(cross, pheno, pmark, min.genoprob = 0.99, k = 20,
  min.classprob = 0.8, classprob2drop = 1, repeatKNN = TRUE,
  max.selfd = 0.3, phenolabel = "phenotype", weightByLinkage = FALSE,
  map.function = c("haldane", "kosambi", "c-f", "morgan"),
  verbose = TRUE)
}
\arguments{
\item{cross}{An object of class \code{"cross"} containing data for a QTL
experiment.  See the help file for \code{\link[qtl]{read.cross}} in the
R/qtl package (\url{http://www.rqtl.org}).  There must be a phenotype named
\code{"id"} or \code{"ID"} that contains the individual identifiers.}

\item{pheno}{A data frame of phenotypes (generally gene expression data),
stored as individuals x phenotypes.  The row names must contain individual
identifiers.}

\item{pmark}{Pseudomarkers that are closest to the genes in \code{pheno}, as
output by \code{\link{find.gene.pseudomarker}}.}

\item{min.genoprob}{Threshold on genotype probabilities; if maximum
probability is less than this, observed genotype taken as \code{NA}.}

\item{k}{Number of nearest neighbors to consider in forming a k-nearest
neighbor classifier.}

\item{min.classprob}{Minimum proportion of neighbors with a common class to
make a class prediction.}

\item{classprob2drop}{If an individual is inferred to have a genotype
mismatch with classprob > this value, treat as an outlier and drop from the
analysis and then repeat the KNN construction without it.}

\item{repeatKNN}{If TRUE, repeat k-nearest neighbor a second time, after
omitting individuals who seem to not be self-self matches.}

\item{max.selfd}{Maximum distance from self (as proportion of mismatches
between observed and predicted eQTL genotypes); individuals whose distance
exceeds this are excluded from the second round of k-nearest neighbor.}

\item{phenolabel}{Label for expression phenotypes to place in the output
distance matrix.}

\item{weightByLinkage}{If TRUE, weight the eQTL to account for their
relative positions (for example, two tightly linked eQTL would each count
about 1/2 of an isolated eQTL).}

\item{map.function}{Map function to use; only used if
\code{weightByLinkage} is TRUE.}

\item{verbose}{If TRUE, give verbose output.}
}
\value{
A matrix with \code{nind(cross)} rows and \code{nrow(pheno)}
columns, containing the distances.  The individual IDs are in the row and
column names.  The matrix is assigned class \code{"lineupdist"}.

The names of the genes that were used to construct the classifier are saved
in an attribute \code{"retained"}.

The observed and inferred eQTL genotypes are saved as attributes
\code{"obsg"} and \code{"infg"}.

The denominators of the proportions that form the inter-individual distances
are in the attribute \code{"denom"}.
}
\description{
Calculate a distance between all pairs of individuals, comparing the
observed eQTL genotypes in a cross to the eQTL genotypes inferred from gene
expression data.
}
\details{
We consider the expression phenotypes in batches, by which pseudomarker they
are closest to.  For each batch, we pull the genotype probabilities at the
corresponding pseudomarker and use the individuals that are in common
between \code{cross} and \code{pheno} and whose maximum genotype probability
is above \code{min.genoprob}, to form a classifier of eQTL genotype from
expression values, using k-nearest neighbor (the function
\code{\link[class]{knn}}). The classifier is applied to all individuals with
expression data, to give a predicted eQTL genotype. (If the proportion of
the k nearest neighbors with a common class is less than
\code{min.classprob}, the predicted eQTL genotype is left as \code{NA}.)

If \code{repeatKNN} is TRUE, we repeat the construction of the k-nearest
neighbor classifier after first omitting individuals whose proportion of
mismatches between observed and inferred eQTL genotypes is greater than
\code{max.selfd}.

Finally, we calculate the distance between the observed eQTL genotypes for
each individual in \code{cross} and the inferred eQTL genotypes for each
individual in \code{pheno}, as the proportion of mismatches between the
observed and inferred eQTL genotypes.

If \code{weightByLinkage} is \code{TRUE}, we use weights on the mismatch
proportions for the various eQTL, taking into account their linkage. Two
tightly linked eQTL will each be given half the weight of a single isolated
eQTL.
}
\examples{
library(qtl)

# load example data
data(f2cross, expr1, pmap, genepos)
\dontshow{
keep <- c(1:20, 197, 553, 573, 740, 794, 822, 1474, 1522,
          1591, 1645, 2080, 2643, 2984, 3089, 3672, 4010, 4039,
          4159, 4191, 4198, 4213, 4401, 4544, 4593, 4925)
expr1 <- expr1[,keep]
genepos <- genepos[keep,]}

# calculate QTL genotype probabilities
f2cross <- calc.genoprob(f2cross, step=1)

# find nearest pseudomarkers
pmark <- find.gene.pseudomarker(f2cross, pmap, genepos)

# line up individuals
id <- findCommonID(f2cross, expr1)

# calculate LOD score for local eQTL
locallod <- calc.locallod(f2cross[,id$first], expr1[id$second,], pmark)

# take those with LOD > 25
expr1s <- expr1[,locallod>25,drop=FALSE]

# calculate distance between individuals
#     (prop'n mismatches between obs and inferred eQTL geno)
d <- disteg(f2cross, expr1s, pmark)

# plot distances
plot(d)

# summary of apparent mix-ups
summary(d)

# plot of classifier for first and second eQTL
par(mfrow=c(2,1), las=1)
plotEGclass(d)
plotEGclass(d, 2)

}
\seealso{
\code{\link{distee}}, \code{\link{summary.lineupdist}},
\code{\link{pulldiag}}, \code{\link{omitdiag}}, \code{\link{findCommonID}},
\code{\link{find.gene.pseudomarker}}, \code{\link{calc.locallod}},
\code{\link{plot.lineupdist}}, \code{\link[class]{knn}},
\code{\link{plotEGclass}}
}
\author{
Karl W Broman, \email{broman@wisc.edu}
}
\keyword{utilities}
