# Bind, at the top level of this file, every name listed below to NULL.
# Several of these names (e.g., `time`, `id`, `hz`, `hz_norm`, `i`, `n`, `sd`,
#  `ci`) are used unquoted as column names inside dplyr verbs further down.
# NOTE(review): presumably this is here to silence the "no visible binding for
#  global variable" notes raised by R CMD check - confirm.
# Keep in mind that these bindings are real objects: whenever one of these
#  names is NOT found as a column of the data being processed, evaluation
#  falls back to the NULL defined here (instead of raising "object not found").
data <- t_pks <- id <- sd <- n <- msg <- time <- hz <- hz_norm <- peak <- val <- ci <- bpm_ci <- val_ci <- state <- d_r <- d_f <- i <- keep <- path <- multi <- x <- y <- SD <- VAR <- var <- smoothed <- . <- NULL

#' Check if a file is a PULSE file
#'
#' @description
#' `heartbeatr-package` utility function. Inspects the first 50 lines of the file at `path` and checks (ignoring case) that they mention both `www.electricblue.eu` and `pulse`.
#'
#' @param path file path
#'
#' @return
#' A logical indicating if the path supplied corresponds to a valid PULSE file or not.
#'
#' @export
is.pulse <- function(path) {
	# only the top of the file is inspected; lower-casing makes the two checks
	#  below case-insensitive
	header <- stringr::str_to_lower(readr::read_lines(path, n_max = 50))

	# fixed() forces a literal match - as a regular expression, each "." in
	#  the URL would match ANY character (e.g., "wwwXelectricblueXeu")
	has_eb <- any(stringr::str_detect(header, stringr::fixed("www.electricblue.eu")))

	has_pulse <- any(stringr::str_detect(header, stringr::fixed("pulse")))

	# return (both markers must be present)
	has_eb && has_pulse
}

# Check if `tbl` has the layout tested below: a tibble whose first column is
#  named "time" and holds POSIXct values, and whose remaining columns, once
#  flattened together, form a plain numeric vector.
# Returns a single TRUE/FALSE and never fails on malformed input (including a
#  tibble without any columns or with only the "time" column).
is.pulse.tbl <- function(tbl) {
	if (!tibble::is_tibble(tbl)) return(FALSE)

	# identical() rather than `==`: for a tibble without columns the first
	#  column name is NA, and `NA == "time"` (also NA) cannot be used as a
	#  condition
	if (!identical(colnames(tbl)[1], "time")) return(FALSE)

	if (!inherits(tbl$time, "POSIXct")) return(FALSE)

	# all columns other than "time" are flattened into a single vector, which
	#  must be of class "numeric" (no other columns -> NULL -> FALSE)
	identical(class(unlist(tbl[, -1])), "numeric")
}

# Inspect the first 50 lines of the file at `path` (ignoring case) and return
#  TRUE when NONE of them contains the text "downloading device type", FALSE
#  otherwise.
# NOTE(review): presumably that text is only present in the header of files
#  that were not generated by a multi-channel system - confirm.
is.pulse.multi <- function(path) {
	header <- stringr::str_to_lower(readr::read_lines(path, n_max = 50))

	has_marker <- any(stringr::str_detect(header, "downloading device type"))

	# return (visibly - the value is not the result of an assignment)
	!has_marker
}

#' Get paths to pulse example files
#'
#' @description
#' `heartbeatr-package` comes bundled with several sample files in its inst/extdata directory. This function makes them easy to access.
#'
#' @param pattern Pattern to select one or more example files. Pattern is vectorized, so more than one value can be supplied. If NULL, all example files are listed.
#'
#' @return The full path to one or more example files (those matching `pattern`), or the full paths to all example files available (if `pattern = NULL`).
#' @export
#'
#' @seealso
#'  * [pulse_read()] can be used to read data from the example files
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once, and it can be called to read and process the example files
#'
#' @examples
#' # Get the paths to all example files
#' pulse_example()
pulse_example <- function(pattern = NULL) {
	# full paths to everything stored in the package's extdata folder
	available <- dir(system.file("extdata", package = "heartbeatr"), full.names = TRUE)

	# no pattern supplied: list all example files
	if (is.null(pattern)) return(available)

	# one lookup per pattern, then flatten the matches into a single vector
	matches <- purrr::map(pattern, function(p) stringr::str_subset(available, p))
	unlist(matches)
}

#' Normalize PULSE heartbeat rate estimates
#'
#' @description
#' Take the output from [`PULSE()`] and compute the normalized heartbeat rates. The normalization of heartbeat rates is achieved by calculating, for each individual (i.e., PULSE channel), the average heartbeat rate during a reference baseline period (ideally measured during acclimation, before the stress-inducing treatment is initiated).
#'
#' @section Details:
#' Normalizing heartbeat rates is important because even individuals from the same species, the same age cohort and subjected to the same treatment will have different basal heart beat frequencies. After normalizing, these differences are minimized, and the analysis can focus on the change of heart beat frequency relative to a reference period (the baseline period chosen) rather than on the absolute values of heart beat frequency - which can be misleading.
#'
#' The period chosen for the baseline doesn't need to be long - it's much more important that conditions (and hopefully heart beat frequencies) are as stable and least stressful as possible during that period.
#'
#' After normalization, heart beat frequencies during the baseline period will, by definition, average to `1`. Elsewhere, normalized heart beat frequencies represent ratios relative to the baseline: `2` represents a heart beat frequency double the basal frequency, while `0.5` indicates half of the basal frequency. This means that two individuals may experience a doubling of heart beat frequency throughout an experiment even if their absolute heart beat frequencies are markedly different from each other (e.g., individual 1 with hz = 0.6 at t0 and hz = 1.2 at t1, and individual 2 with hz = 0.8 at t0 and hz = 1.6 at t1, will both show hz_norm = 1 at t0 and hz_norm = 2 at t1).
#'
#' @section Different baseline periods for each channel:
#' `pulse_normalize` only allows setting a single baseline period. If different periods are needed for different channels or groups of channels, generate two or more subsets of `heart_rates` containing `heart_rates$id` that share the same baseline periods, normalize each independently and bind all data together at the end (see the examples section below).
#'
#' @param heart_rates the output from [PULSE()], [pulse_heart()], [pulse_doublecheck()] or [pulse_choose_keep].
#' @param FUN the function to be applied to normalize the data within the baseline period (defaults to [mean]; [median] may be more suited in some situations; any other function that returns a single numeric value is technically acceptable).
#' @param t0 either `NULL` (default), a [POSIXct] object or a character string that can be directly converted to a [POSIXct] object. Represents the beginning of the period to be used to establish the baseline heart beat frequency (same value is used for all channels). If set to `NULL`, the baseline period is set to the earliest timestamp available.
#' @param span_mins numeric, defaults to 10; number of minutes since `t0`, indicating the width of the baseline period (baseline from `t0` to `t0` + `span_mins` mins)
#' @param overwrite logical, defaults to `FALSE`; should the normalized values be stored in a different column (`hz_norm` if `overwrite = FALSE`; RECOMMENDED) or replace the data in the column `hz` (`overwrite = TRUE`; WARNING: the original `hz` values cannot be recovered).
#'
#' @return The same tibble provided as input, with an additional column `hz_norm` containing the normalized heart beat frequencies (`overwrite = FALSE`) or with the same number of columns and normalized data saved to the column `hz` (`overwrite = TRUE`).
#'
#' @export
#'
#' @seealso
#'  * [pulse_heart()], [pulse_doublecheck()] and [pulse_choose_keep()] are the functions that generate the input for `pulse_normalize`
#'  * [pulse_plot()] can be called to visualize the output from `pulse_normalize`
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once, and its output can also be passed on to `pulse_normalize`
#'
#' @examples
#' ## Begin prepare data ----
#' pulse_data_sub <- pulse_data
#' pulse_data_sub$data <- pulse_data_sub$data[,1:5]
#' pulse_data_split <- pulse_split(
#'    pulse_data_sub,
#'    window_width_secs = 30,
#'    window_shift_secs = 60,
#'    min_data_points = 0.8)
#' pulse_data_split <- pulse_optimize(pulse_data_split, multi = pulse_data$multi)
#' heart_rates <- pulse_heart(pulse_data_split)
#' heart_rates <- pulse_doublecheck(heart_rates)
#' ## End prepare data ----
#'
#' # Normalize data using the same period as baseline for all channels
#' pulse_normalize(heart_rates)
#'
#' # Using a different (complex) function
#' pulse_normalize(heart_rates, FUN = function(x) quantile(x, 0.4))
#'
#' # Apply different baseline periods to two groups of IDs
#' group_1 <- c("limpet_1", "limpet_2")
#' rbind(
#'   # group_1
#'   pulse_normalize(heart_rates[ (heart_rates$id %in% group_1), ], span_mins = 10),
#'   # all other IDs
#'   pulse_normalize(heart_rates[!(heart_rates$id %in% group_1), ], span_mins = 30)
#' )
pulse_normalize <- function(heart_rates, FUN = mean, t0 = NULL, span_mins = 10, overwrite = FALSE) {
	stopifnot(is.numeric(span_mins))
	stopifnot(length(span_mins) == 1)

	# discard the result of any previous normalization - otherwise the join
	#  below would produce 'hz_norm.x' and 'hz_norm.y' (but no 'hz_norm') and
	#  the computation of the normalized values would fail
	# (this is a no-op if 'heart_rates' has no 'hz_norm' column)
	heart_rates$hz_norm <- NULL

	# the baseline period starts at 't0' (the earliest timestamp available if
	#  none was supplied); character values are interpreted using the time
	#  zone of the data
	TZ <- lubridate::tz(heart_rates$time[1])
	if (is.null(t0)) t0 <- min(heart_rates$time)
	t0 <- as.POSIXct(t0, tz = TZ)

	# find the basal heart rate
	# (one value per id, computed with FUN over the estimates falling within
	#  the baseline period, both limits included; the value is temporarily
	#  stored under the name 'hz_norm')
	baseline <- heart_rates %>%
		dplyr::filter(dplyr::between(time, t0, t0 + span_mins * 60)) %>%
		dplyr::group_by(id) %>%
		dplyr::summarise(hz_norm = FUN(hz))

	# compute the normalized heartbeat rate
	# (after the join 'hz_norm' holds the baseline of each id, and it is then
	#  replaced by the ratio between 'hz' and that baseline; ids without data
	#  in the baseline period end up with NA)
	heart_rates <- heart_rates %>%
		dplyr::left_join(baseline, by = "id") %>%
		dplyr::mutate(hz_norm = hz / hz_norm)

	# if requested, store the normalized values in 'hz' itself
	if (overwrite) {
		heart_rates <- heart_rates %>%
			dplyr::mutate(hz = hz_norm) %>%
			dplyr::select(-hz_norm)
	}

	# return
	heart_rates
}

#' Summarise PULSE heartbeat rate estimates over new time windows
#'
#' @description
#' Take the output from [`PULSE()`] and summarise `hz` estimates over new user-defined time windows using `FUN` (a summary function). In effect, this procedure reduces the number of data points available over time.
#'
#' Note that the output of `pulse_summarise()` can be inspected with [`pulse_plot()`] but not [`pulse_plot_raw()`].
#'
#' @section Details:
#' The PULSE multi-channel system captures data continuously. When processing those data, users should aim to obtain estimates of heart beat frequency at a rate that conforms to their system's natural temporal variability, or risk running into oversampling (which has important statistical implications and must be avoided or explicitly handled).
#'
#' With this in mind, users can follow two strategies:
#'
#' *If, for example, users are targeting 1 data point every 5 mins...*
#'
#' * If the raw data is of good quality (i.e., minimal noise, signal wave with large amplitude), users can opt for a relatively narrow split_window (e.g., by setting `window_width_secs` in [PULSE()] (or [pulse_split()]) to `30` secs) and to only sample split_windows every 5 mins with `window_shift_secs = 300`. This means that data is processed in 5-mins split-windows where 30 secs of data are used and four and a half mins of data are skipped, yielding our target of 1 data point every 5 mins. Doing so will greatly speed up the processing of the data (fewer and smaller windows to work on), and the final output will immediately have the desired sample frequency. However, if any of the split_windows effectively analysed features a gap in the data or happens to coincide with the occasional drop in signal quality, those estimates of heartbeat rate will reflect that lack of quality (even if *better* data may be present in the four and a half mins of data that is being skipped). This strategy is usually used at the beginning to assess the dataset, and depending on the results, the more time-consuming strategy described next may have to be used instead.
#'
#' * If sufficient computing power is available and/or the raw data can't be guaranteed to be high quality from beginning to end, users can opt for a strategy that scans the entire dataset without skipping any data. This can be achieved by setting `window_width_secs` and `window_shift_secs` in [PULSE()] (or [pulse_split()]) to the same low value. In this case, if both parameters are set to `30` secs, processing will take significantly longer and each 5 mins of data will result in `10` data points. Then, `pulse_summarise` can be used with `span_mins = 5` to summarise the data points back to the target sample frequency. More importantly, if the right summary function is used, this strategy can greatly reduce the negative impact of spurious *bad* readings. For example, setting `FUN = median` will reduce the contribution of values of `hz` that deviate from the center ("wrong" values) to the final heartbeat estimate for a given time window. Thus, if the computational penalty is bearable, this more robust strategy can prove useful.
#'
#' @inheritParams pulse_normalize
#' @param FUN a custom function, defaults to `median`; Note that `FUN` must take a vector of `numeric` values and output a single `numeric` value.
#' @param span_mins integer, in `mins`, defaults to `10`; expresses the width of the new summary windows
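#' @param min_data_points numeric, defaults to `2`; threshold for the number of data points in each new summarizing window. Only windows with MORE than `min_data_points` data points are kept (i.e., with the default of `2`, a window needs at least 3 data points); windows with `min_data_points` or fewer data points are discarded. If set to `0` (zero), no window is ever discarded.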
#'
#' @return A similar tibble as the one provided for input, but fewer columns and rows. Among the columns now absent is the `data` column (raw data is no longer available). IMPORTANT NOTE: Despite retaining the same names, several columns present in the output now provide slightly different information (because they are recalculated for each summarizing window): `time` corresponds to the first time stamp of the summarizing window; `n` shows the number of valid original windows used by the summary function; `sd` represents the standard deviation of all heartbeat rate estimates within each summarizing window (and not the standard deviation of the intervals between each identified wave peak, as was the case in `heart_rates`); `ci` is the confidence interval of the new value for `hz`.
#'
#' @export
#'
#' @seealso
#'  * [pulse_heart()], [pulse_doublecheck()], [pulse_choose_keep()], and [pulse_normalize()] are the functions that generate the input for `pulse_summarise`
#'  * [pulse_plot()] can be called to visualize the output from `pulse_summarise` (but not [pulse_plot_raw()])
#'  * [PULSE()] is a wrapper function that executes all the steps needed to process PULSE data at once, and its output can also be passed on to `pulse_summarise`
#'
#' @examples
#' ## Begin prepare data ----
#' paths <- pulse_example()
#' heart_rates <- PULSE(
#'   paths,
#'   discard_channels = c(paste0("c0", c(1:7, 9)), "c10"),
#'   show_progress = FALSE
#'   )
#' ## End prepare data ----
#'
#' # Summarise heartbeat estimates (1 data point every 5 mins)
#' nrow(heart_rates) # == 13
#' summarised_5mins <- pulse_summarise(heart_rates, span_mins = 5)
#' nrow(summarised_5mins) # == 3
#' summarised_5mins
#'
#' # using a custom function
#' pulse_summarise(heart_rates, span_mins = 5, FUN = function(x) quantile(x, 0.2))
#'
#' # normalized data is supported automatically
#' pulse_summarise(pulse_normalize(heart_rates))
#'
#' # Note that visualizing the output from 'pulse_summarise()' with
#' #  'pulse_plot()' may result in many warnings
#' pulse_plot(summarised_5mins)
#' "> There were 44 warnings (use warnings() to see them)"
#'
#' # That happens when the value chosen for 'span_mins' is such
#' #  that the output from 'pulse_summarise()' doesn't contain
#' #  enough data points for the smoothing curve to be computed
#' # Alternatively, do one of the following:
#'
#' # reduce 'span_mins' to still get enough data points
#' pulse_plot(pulse_summarise(heart_rates, span_mins = 2, min_data_points = 0))
#'
#' # or disable the smoothing curve
#' pulse_plot(summarised_5mins, smooth = FALSE)
pulse_summarise <- function(heart_rates, FUN = stats::median, span_mins = 10, min_data_points = 2) {
	# validate early, so that malformed values don't surface as obscure
	#  errors raised from within the summarising pipeline
	stopifnot(is.numeric(span_mins), length(span_mins) == 1, !is.na(span_mins))
	stopifnot(is.numeric(min_data_points), length(min_data_points) == 1, !is.na(min_data_points))

	# make whole (just in case)
	span_mins <- floor(span_mins)
	# a summarising window must be at least 1 min wide
	stopifnot(span_mins >= 1)

	# normalized data is summarised as well, if present
	# (exact match on the column name, so that columns that merely contain
	#  "hz_norm" in their names - e.g., 'hz_norm.x' - aren't mistaken for it)
	has.norm <- "hz_norm" %in% colnames(heart_rates)

	# summarise
	# (one row per id and summarising window; 'time' becomes the timestamp
	#  floored to a multiple of 'span_mins')
	heart_rates <- heart_rates %>%
		dplyr::group_by(
			id,
			time = lubridate::floor_date(
				time,
				stringr::str_c(span_mins, " mins")
			)
		) %>%
		dplyr::summarise(
			i  = min(i),
			n  = dplyr::n(),
			# 'sd' must be computed BEFORE 'hz' is replaced by its summary,
			#  as expressions are evaluated sequentially
			sd = stats::sd(hz),
			hz = FUN(hz),
			# placeholder (NA) if there's no normalized data; removed below
			hz_norm = (if (has.norm) {FUN(hz_norm)} else {NA}),
			# NOTE(review): 1.96 is presumably the normal quantile for a 95%
			#  interval - confirm
			ci = sd * 1.96,
			.groups = "drop"
		)

	# tidy
	# (only windows with MORE than 'min_data_points' data points are kept;
	#  'i' is then renumbered into consecutive integers starting at 1)
	heart_rates <- heart_rates %>%
		dplyr::filter(n > min_data_points) %>%
		dplyr::relocate(i, id, time, hz, hz_norm) %>%
		dplyr::mutate(i = factor(i) %>% as.numeric()) %>%
		dplyr::arrange(i)
	if (!has.norm) heart_rates <- dplyr::select(heart_rates, -hz_norm)

	# output
	heart_rates
}

