% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cluster_fun.R
\name{cluster_on_missing_prop}
\alias{cluster_on_missing_prop}
\title{Cluster Samples Based on Missingness Proportions}
\usage{
cluster_on_missing_prop(
  prop_matrix,
  n_clusters = NULL,
  seed = NULL,
  k_neighbors = NULL,
  leiden_resolution = 0.25,
  use_snn = TRUE,
  leiden_objective = "CPM",
  metric = "euclidean",
  scale_features = FALSE
)
}
\arguments{
\item{prop_matrix}{Matrix or data frame where \strong{rows are samples} and
\strong{columns are features}; entries are missingness proportions in \verb{[0,1]}.
Can be created with \code{create_missingness_prop_matrix()}.}

\item{n_clusters}{Integer; number of clusters for K-means. If \code{NULL},
Leiden clustering is used instead (default: \code{NULL}).}

\item{seed}{Integer; random seed for K-means reproducibility (default: \code{NULL}).}

\item{k_neighbors}{Integer; number of nearest neighbors used for Leiden
clustering. If \code{NULL}, the Python-side default is used (default: \code{NULL}).}

\item{leiden_resolution}{Numeric; resolution parameter for Leiden clustering
(default: \code{0.25}).}

\item{use_snn}{Logical; whether to use shared nearest neighbors
(default: \code{TRUE}).}

\item{leiden_objective}{Character; Leiden optimization objective
(default: \code{"CPM"}).}

\item{metric}{Character; distance metric. Options include
\code{"euclidean"} and \code{"cosine"} (default: \code{"euclidean"}).}

\item{scale_features}{Logical; whether to standardize \strong{feature columns}
before clustering samples (default: \code{FALSE}).}
}
\value{
A list with:
\itemize{
\item \code{clusters}: Integer vector of cluster assignments per \strong{sample}.
\item \code{silhouette_score}: Numeric silhouette score, or \code{NULL}
if not computable.
}
}
\description{
Groups \strong{samples} with similar patterns of missingness across features using
either K-means clustering (when \code{n_clusters} is specified) or Leiden
(when \code{n_clusters} is \code{NULL}). This is useful for detecting cohorts with
shared missing-data behavior (e.g., site/batch effects).
}
\examples{
set.seed(123)

# Toy data: 12 samples, two features (A, B) each measured at 3 timepoints.
# NAs are placed so that missingness differs between samples.
dat <- data.frame(
  sample_id = paste0("s", 1:12),
  A_1 = c(NA, rnorm(11)),
  A_2 = c(NA, rnorm(11)),
  A_3 = rnorm(12),
  B_1 = rnorm(12),
  B_2 = c(rnorm(10), NA, NA),
  B_3 = rnorm(12)
)

# Build the missingness proportion matrix (proportions by feature)
pm <- create_missingness_prop_matrix(
  dat,
  index_col = "sample_id",
  repeat_feature_names = c("A", "B")
)

## cluster_on_missing_prop requires a working Python environment via reticulate
## Examples are wrapped in try() to avoid failures on CRAN check systems
try({
  # n_clusters is set, so K-means is used; seed is passed explicitly because
  # it is the argument documented as controlling K-means reproducibility.
  res <- cluster_on_missing_prop(
    pm,
    n_clusters = 2,
    seed = 123,
    metric = "cosine",
    scale_features = TRUE
  )

  table(res$clusters)
  res$silhouette_score
})

}
