% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/getDescriptionStatsBy.R
\name{getDescriptionStatsBy}
\alias{getDescriptionStatsBy}
\title{Creating description statistics}
\usage{
getDescriptionStatsBy(x, by, digits = 1, digits.nonzero = NA,
  html = TRUE, numbers_first = TRUE, statistics = FALSE,
  statistics.sig_lim = 10^-4, statistics.two_dec_lim = 10^-2,
  statistics.suppress_warnings = TRUE, useNA = c("ifany", "no",
  "always"), useNA.digits = digits, continuous_fn = describeMean,
  prop_fn = describeProp, factor_fn = describeFactors,
  show_all_values = FALSE, hrzl_prop = FALSE, add_total_col,
  total_col_show_perc = TRUE, use_units = FALSE, default_ref,
  NEJMstyle = FALSE, percentage_sign = TRUE, header_count,
  missing_value = "-", names_of_missing = NULL, ...)
}
\arguments{
\item{x}{The variable that you want the statistics for}

\item{by}{The variable that you want to split into different
columns}

\item{digits}{The number of decimals used}

\item{digits.nonzero}{The number of decimals used for values that are close to zero}

\item{html}{If HTML compatible output should be used. If \code{FALSE}
it outputs LaTeX formatting}

\item{numbers_first}{If the number should be presented first, otherwise the
percentage is presented first. The second is encapsulated in parentheses ().}

\item{statistics}{Add statistics, Fisher's exact test for proportions and Wilcoxon
for continuous variables. See details below for more customization.}

\item{statistics.sig_lim}{The significance limit for < sign, i.e. p-value 0.0000312
should be < 0.0001 with the default setting.}

\item{statistics.two_dec_lim}{The limit for showing two decimals. E.g.
the p-value may be 0.056 and we may want to keep the two decimals in order
to emphasize the proximity to the all-mighty 0.05 p-value and set this to
\eqn{10^{-2}}{10^-2}. This allows a value of 0.0056 to be rounded to 0.006, which
makes intuitive sense as the 0.0056 level is well below
the 0.05 value and thus the exact proximity to 0.05 is not as interesting
to know. \emph{Disclaimer:} The 0.05-limit is really silly and debated, unfortunately
it remains a standard and this package tries to adapt to the current standards in order
to limit publication associated issues.}

\item{statistics.suppress_warnings}{Hide warnings from the statistics function.}

\item{useNA}{This indicates if missing should be added as a separate
row below all other. See \code{\link[base]{table}} for \code{useNA}-options.
\emph{Note:} defaults to \code{"ifany"} and not \code{"no"} as \code{\link[base]{table}} does.}

\item{useNA.digits}{The number of digits to use for the
missing percentage, defaults to the overall \code{digits}.}

\item{continuous_fn}{The method to describe continuous variables. The
default is \code{\link{describeMean}}.}

\item{prop_fn}{The method used to describe proportions, see \code{\link{describeProp}}.}

\item{factor_fn}{The method used to describe factors, see \code{\link{describeFactors}}.}

\item{show_all_values}{This is by default false as for instance if there is
no missing and there is only one variable then it is most sane to only show
one option as the other one will just be a complement to the first. For instance
sex - if you know gender then automatically you know the distribution of the
other sex as it's 100 \% - other \%. To choose which one you want to show then
set the \code{default_ref} parameter.}

\item{hrzl_prop}{This is default FALSE and indicates
that the proportions are to be interpreted in a vertical manner.
If we want the data to be horizontal, i.e. the total should be shown
and then how these differ in the different groups then set this to TRUE.}

\item{add_total_col}{This adds a total column to the resulting table.
You can also specify if you want the total column "first" or "last"
in the column order.}

\item{total_col_show_perc}{This is by default true but if
requested the percentages are suppressed as this sometimes may be confusing.}

\item{use_units}{If the Hmisc package's units() function has been employed
it may be interesting to have a column at the far right that indicates the
unit measurement. If this column is specified then the total column will
appear before the units (if specified as last). You can also set the value to
\code{"name"} and the units will be added to the name as a parenthesis,
e.g. Age (years).}

\item{default_ref}{The default reference, either first,
the level name or a number within the levels. If left out
it defaults to the first value.}

\item{NEJMstyle}{Adds - no (\%) at the end to proportions}

\item{percentage_sign}{If you want to suppress the percentage sign you
can set this variable to FALSE. You can also choose something else than
the default \% if you so wish by setting this variable.}

\item{header_count}{Set to \code{TRUE} if you want to add a header count,
e.g. Smoking; No. 25 observations, where there is a new line after the
factor name. If you want a different text for the second line you can
specifically use the \code{\link[base]{sprintf}} formatting, e.g. "No. \%s patients".}

\item{missing_value}{Value that is substituted for empty cells. Defaults to "-"}

\item{names_of_missing}{Optional character vector containing the names of returned statistics,
in case all returned values for a given \code{by} level are missing. Defaults to NULL}

\item{...}{Currently only used for generating warnings of deprecated call
parameters.}
}
\value{
Returns a vector if vars wasn't specified and it's a
 continuous or binary statistic. If vars was a matrix then it
 appends the result to the end of that matrix. If the x variable
 is a factor then it does not append and you get a warning.
}
\description{
A function that returns a description statistic that can be used
for creating a publication "table 1" when you want it by groups.
The function identifies if the variable is a continuous, binary
or a factored variable. The format is inspired by NEJM, Lancet &
BMJ.
}
\section{Customizing statistics}{


You can specify which function you want to use for the statistics by providing a function
that takes two arguments \code{x} and \code{by} and returns a p-value. There are
a few functions already prepared for this, see \code{\link{getPvalAnova}},
\code{\link{getPvalChiSq}},
\code{\link{getPvalFisher}},
\code{\link{getPvalKruskal}} and
\code{\link{getPvalWilcox}}.
The default functions used are \code{getPvalFisher} and \code{getPvalWilcox} (unless the \code{by}
argument has more than three unique levels where it defaults to \code{getPvalAnova}).

If you want the function to select functions depending on the type of input
you can provide a list with the names \code{'continuous'}, \code{'proportion'}, \code{'factor'} and
the function will choose accordingly. If you fail to define a certain category
it will default to the above.

You can also use a custom function that returns a string with the attribute 'colname' set that will be appended
to the results instead of the p-value column.
}

\examples{
data(mtcars)
# For labelling we use the label()
# function from the Hmisc package
library(Hmisc)

# Fix the random seed so that the sampled colors and the
# injected missing values below are the same on every run
set.seed(1)

label(mtcars$mpg) <- "Gas"
units(mtcars$mpg) <- "Miles/(US) gallon"

label(mtcars$wt) <- "Weight"
# The mtcars help page gives wt in units of 1000 lbs
units(mtcars$wt) <- "10<sup>3</sup> lbs"

mtcars$am <- factor(mtcars$am, levels=0:1, labels=c("Automatic", "Manual"))
label(mtcars$am) <- "Transmission"

mtcars$gear <- factor(mtcars$gear)
label(mtcars$gear) <- "Gears"

# Make up some data for making it slightly more interesting
mtcars$col <- factor(sample(c("red", "black", "silver"),
                     size=NROW(mtcars), replace=TRUE))
label(mtcars$col) <- "Car color"

# Two continuous variables split by transmission, with a header
# count and the units shown in a separate column
mergeDesc(getDescriptionStatsBy(mtcars$mpg, mtcars$am,
                                header_count = TRUE,
                                use_units = TRUE),
          getDescriptionStatsBy(mtcars$wt, mtcars$am,
                                header_count = TRUE,
                                use_units = TRUE),
          htmlTable_args = list(caption  = "Basic continuous stats from the mtcars dataset"))

# Factor variables collected in a list, one named element
# and one unnamed element
tll <- list()
tll[["Gear (3 to 5)"]] <- getDescriptionStatsBy(mtcars$gear, mtcars$am)
tll <- c(tll,
         list(getDescriptionStatsBy(mtcars$col, mtcars$am)))

mergeDesc(tll,
          htmlTable_args = list(caption  = "Factored variables"))

# Continuous variables where the unit is written into the
# list name instead of using the use_units argument
tl_no_units <- list()
tl_no_units[["Gas (mile/gallons)"]] <-
  getDescriptionStatsBy(mtcars$mpg, mtcars$am,
                        header_count = TRUE)
tl_no_units[["Weight (10<sup>3</sup> lbs)"]] <-
  getDescriptionStatsBy(mtcars$wt, mtcars$am,
                        header_count = TRUE)
mergeDesc(tl_no_units, tll,
          # Remove the formatting for the groups
          htmlTable_args = list(css.rgroup = ""))


# A little more advanced
# Blank out five random mpg values to get a missing row
mtcars$mpg[sample(seq_len(NROW(mtcars)), size=5)] <- NA
getDescriptionStatsBy(mtcars$mpg, mtcars$am, statistics=TRUE)

# Do the horizontal version
getDescriptionStatsBy(mtcars$col, mtcars$am,
                      statistics=TRUE, hrzl_prop = TRUE)

# Continuous variable with missing values, horizontal
# proportions and no percentages in the total column
mtcars$wt_with_missing <- mtcars$wt
mtcars$wt_with_missing[sample(seq_len(NROW(mtcars)), size=8)] <- NA
getDescriptionStatsBy(mtcars$wt_with_missing, mtcars$am, statistics=TRUE,
                      hrzl_prop = TRUE, total_col_show_perc = FALSE)


# The same settings for a factor variable with missing values
mtcars$col_with_missing <- mtcars$col
mtcars$col_with_missing[sample(seq_len(NROW(mtcars)), size=5)] <- NA
getDescriptionStatsBy(mtcars$col_with_missing, mtcars$am, statistics=TRUE,
                      hrzl_prop = TRUE, total_col_show_perc = FALSE)


\dontrun{
  ## There is also a LaTeX wrapper
  tll <- list(
    getDescriptionStatsBy(mtcars$gear, mtcars$am),
    getDescriptionStatsBy(mtcars$col, mtcars$am))

  latex(mergeDesc(tll),
        caption  = "Factored variables",
        file="")
}
}
\seealso{
Other descriptive functions: \code{\link{describeFactors}},
  \code{\link{describeMean}}, \code{\link{describeMedian}},
  \code{\link{describeProp}}, \code{\link{getPvalWilcox}}
}
\concept{descriptive functions}
