# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Delineate clusters from a similarity matrix
#'
#' From a matrix of spectra similarity (e.g., with the cosine metric,
#' or Pearson product moment), infer the species clusters based on a
#' threshold **above** (or **equal to**) which spectra are considered alike.
#'
#' @param sim_matrix A *n* × *n* similarity matrix, with *n* the number of spectra. Columns should be named as the rows.
#' @param threshold A numeric value indicating the minimal similarity between two spectra. Adjust accordingly to the similarity metric used.
#'
#' @return A tibble of *n* rows for each spectra and 3 columns:
#' * `name`: the rownames of the similarity matrix indicating the spectra names
#' * `membership`: integers stating the cluster number to which the spectra belong to. It starts from 1 to _c_, the total number of clusters.
#' * `cluster_size`: integers indicating the total number of spectra in the corresponding cluster.
#'
#' @details The matrix is essentially a network
#'  where nodes are spectra and links exist between spectra only if the similarity
#'  between the spectra is above the threshold.
#'
#'  The original idea to find the cluster members comes from a [StackOverflow answer by the user
#'  ekstroem](https://stackoverflow.com/a/57613463). However, here the
#'  implementation differs in two way:
#'
#'  1. It relies on the connected components of the network instead of the fast greedy
#'   modularity algorithm.
#'  2. It uses base R functions to reduce the dependencies
#'
#'
#' @seealso For similarity metrics: [`coop::tcosine`](https://rdrr.io/cran/coop/man/cosine.html), [stats::cor], [`Hmisc::rcorr`](https://rdrr.io/cran/Hmisc/man/rcorr.html). For using taxonomic identifications for clusters : [delineate_with_identification]. For further analyses: [set_reference_spectra].
#' @export
#' @examples
#' # Toy similarity matrix between the six example spectra of
#' #  three species. The cosine metric is used and a value of
#' #  zero indicates dissimilar spectra and a value of one
#' #  indicates identical spectra.
#' cosine_similarity <- matrix(
#'   c(
#'     1, 0.79, 0.77, 0.99, 0.98, 0.98,
#'     0.79, 1, 0.98, 0.79, 0.8, 0.8,
#'     0.77, 0.98, 1, 0.77, 0.77, 0.77,
#'     0.99, 0.79, 0.77, 1, 1, 0.99,
#'     0.98, 0.8, 0.77, 1, 1, 1,
#'     0.98, 0.8, 0.77, 0.99, 1, 1
#'   ),
#'   nrow = 6,
#'   dimnames = list(
#'     c(
#'       "species1_G2", "species2_E11", "species2_E12",
#'       "species3_F7", "species3_F8", "species3_F9"
#'     ),
#'     c(
#'       "species1_G2", "species2_E11", "species2_E12",
#'       "species3_F7", "species3_F8", "species3_F9"
#'     )
#'   )
#' )
#' # Delineate clusters based on a 0.92 threshold applied
#' #  to the similarity matrix
#' delineate_with_similarity(cosine_similarity, threshold = 0.92)
delineate_with_similarity <- function(sim_matrix, threshold) {
  if (!is.matrix(sim_matrix)) {
    stop("The similarity matrix is not a matrix.")
  }
  if (nrow(sim_matrix) != ncol(sim_matrix)) {
    stop("The similarity matrix is not square: nrow != ncol.")
  }
  if (is.null(rownames(sim_matrix)) || is.null(colnames(sim_matrix))) {
    stop("The similarity matrix has no rownames or colnames.")
  }
  if (any(rownames(sim_matrix) != colnames(sim_matrix))) {
    stop("The similarity matrix has no identical names.")
  }

  # Spectra as nodes are connected only if similarity is above the threshold
  #
  name_gtr_eq_threshold <- function(vec, threshold) {
    if (!is.numeric(threshold)) {
      stop("The threshold provided is not a numeric.")
    }
    # Names of vec were checked above with the full matrix
    base::names(vec)[vec >= threshold]
  }
  # Extract elements of lists from nested lists and ensure sorted uniqueness
  gather_cluster_membership <- function(nested_list) {
    base::unlist(nested_list, use.names = FALSE) %>%
      base::unique() %>%
      base::sort() %>%
      paste(collapse = "|") %>%
      base::as.factor() %>%
      return()
  }
  # Delineate clusters by creating a vector of membership (including self) and friends-of-friends with recursive lists.
  # The vector is then rendered unique using factors and converted into integer id
  memberships <- apply(sim_matrix, 1, name_gtr_eq_threshold, threshold = threshold)

  lapply(memberships, function(x) memberships[x]) %>%
    sapply(gather_cluster_membership) %>%
    tibble::enframe(value = "membership") %>%
    dplyr::group_by(.data$membership) %>%
    dplyr::mutate(
      "membership" = base::as.integer(.data$membership),
      "cluster_size" = dplyr::n()
    ) %>%
    dplyr::ungroup() %>%
    return()
}
