% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_Sample.R
\name{DSC_Sample}
\alias{DSC_Sample}
\title{Extract a Fixed-size Sample from a Data Stream}
\usage{
DSC_Sample(k = 100, biased = FALSE)
}
\arguments{
\item{k}{the number of points to be sampled from the stream.}

\item{biased}{if \code{FALSE} then regular (unbiased) reservoir sampling
is used. If \code{TRUE} then the sample is biased towards keeping more recent data
points (see Details section).}
}
\value{
An object of class \code{DSC_Sample} (subclass of \link{DSC},
\link{DSC_R}, \link{DSC_Micro}).
}
\description{
Micro Clusterer.
Extracts a sample from a data stream using Reservoir Sampling (\link{DSAggregate_Sample}).  The sample
is stored as a set of micro-clusters to be compatible with other data
stream clustering (DSC) algorithms.
}
\details{
If \code{biased = FALSE} then the reservoir sampling algorithm by McLeod and
Bellhouse (1983) is used. This sampling makes sure that each data point has
the same chance to be sampled. All sampled points will have a weight of 1.
Note that this might not be ideal for an evolving stream since very old data
points have the same chance to be in the sample as newer points.

If \code{biased = TRUE} then sampling prefers newer points using the modified
reservoir sampling algorithm 2.1 by Aggarwal (2006). New points are always
added. They replace a random point in the reservoir with a probability of
reservoir size over \code{k}. This is an exponential bias function of
\code{2^{-lambda}} with \code{lambda = 1 / k}.
}
\examples{
# Example 1: regular (unbiased) reservoir sample
# create the data stream to sample from
stream <- DSD_Gaussians(k = 3, d = 2, noise = 0.05)

# keep a fixed-size sample of k = 20 points; biased defaults to FALSE,
# i.e., regular (unbiased) reservoir sampling is used
sample <- DSC_Sample(k = 20)
# feed the stream into the sample (presumably reads 500 points)
update(sample, stream, 500)
# print the sample object
sample

# plot micro-clusters (the sampled points are stored as micro-clusters)
plot(sample, stream)

# recluster the sample with k-means
# (k = 3 presumably matches the k = 3 used to create the stream above)
kmeans <- DSC_Kmeans(k = 3)
recluster(kmeans, sample)
plot(kmeans, stream)

# Example 2: unbiased sample from an evolving stream
stream <- DSD_Benchmark(1)
sample <- DSC_Sample(k = 20)
update(sample, stream, 1000)

plot(sample, stream)
# Note: the clusters move from left to right and the sample keeps many
# outdated points, since with unbiased sampling very old points have the
# same chance to be in the sample as newer points

# Example 3: use a biased sample to keep more recent data points
# the stream is created again before sampling
stream <- DSD_Benchmark(1)
# biased = TRUE makes the sample prefer newer points
sample <- DSC_Sample(k = 20, biased = TRUE)
update(sample, stream, 1000)
plot(sample, stream)
}
\references{
Vitter, J. S. (1985): Random sampling with a reservoir.
\emph{ACM Transactions on Mathematical Software,} 11(1), 37-57.

McLeod, A.I., Bellhouse, D.R. (1983): A Convenient Algorithm for Drawing a
Simple Random Sample. \emph{Applied Statistics,} 32(2), 182-184.

Aggarwal, C. (2006): On Biased Reservoir Sampling in the Presence of Stream
Evolution. \emph{International Conference on Very Large Databases
(VLDB'06),} 607-618.
}
\seealso{
Other DSC_Micro: 
\code{\link{DSC_BICO}()},
\code{\link{DSC_BIRCH}()},
\code{\link{DSC_DBSTREAM}()},
\code{\link{DSC_DStream}()},
\code{\link{DSC_Micro}()},
\code{\link{DSC_Window}()},
\code{\link{DSC_evoStream}()}
}
\author{
Michael Hahsler
}
\concept{DSC_Micro}
