% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus.R
\name{corpus}
\alias{corpus}
\alias{corpus.corpus}
\alias{corpus.character}
\alias{corpus.data.frame}
\alias{corpus.kwic}
\alias{corpus.Corpus}
\title{Construct a corpus object}
\usage{
corpus(x, ...)

\method{corpus}{corpus}(x, docnames = quanteda::docnames(x),
  docvars = quanteda::docvars(x), metacorpus = quanteda::metacorpus(x),
  compress = FALSE, ...)

\method{corpus}{character}(x, docnames = NULL, docvars = NULL,
  metacorpus = NULL, compress = FALSE, ...)

\method{corpus}{data.frame}(x, docid_field = "doc_id",
  text_field = "text", metacorpus = NULL, compress = FALSE, ...)

\method{corpus}{kwic}(x, split_context = TRUE, extract_keyword = TRUE,
  ...)

\method{corpus}{Corpus}(x, metacorpus = NULL, compress = FALSE, ...)
}
\arguments{
\item{x}{a valid corpus source object}

\item{...}{not used directly}

\item{docnames}{Names to be assigned to the texts.  Defaults to the names of
the character vector (if any); \code{doc_id} for a data.frame; the document
names in a \pkg{tm} corpus; or a vector of user-supplied labels equal in
length to the number of documents.  If none of these are found, then
"text1", "text2", etc. are assigned automatically.}

\item{docvars}{a data.frame of document-level variables associated with each text}

\item{metacorpus}{a named list containing additional (character) information
  to be added to the corpus as corpus-level metadata.  Special fields
  recognized in the \code{\link{summary.corpus}} are:
\itemize{
\item{\code{source }}{a description of the source of the texts, used for
  referencing;}
\item{\code{citation }}{information on how to cite the corpus; and}
\item{\code{notes }}{any additional information about who created the text, warnings,
  to do lists, etc.}
}}

\item{compress}{logical; if \code{TRUE}, compress the texts in memory using
gzip compression. This significantly reduces the size of the corpus in
memory, but will slow down operations that require the texts to be
extracted.}

\item{docid_field}{optional column name or index of a document identifier; defaults
to "doc_id", but if this is not found, then will use the rownames of the
data.frame; if the rownames are not set, it will use the default sequence
based on \code{\link{quanteda_options}("base_docname")}.}

\item{text_field}{the character name or numeric index of the source
\code{data.frame} indicating the variable to be read in as text, which must
be a character vector. All other variables in the data.frame will be
imported as docvars.  This argument is only used for \code{data.frame}
objects (including those created by \pkg{readtext}).}

\item{split_context}{logical; if \code{TRUE}, split each kwic row into two
"documents", one for "pre" and one for "post", with this designation saved
in a new docvar \code{context} and with the new number of documents
therefore being twice the number of rows in the kwic.}

\item{extract_keyword}{logical; if \code{TRUE}, save the keyword matching
\code{pattern} as a new docvar \code{keyword}.}
}
\value{
A \link{corpus-class} class object containing the original texts,
  document-level variables, document-level metadata, corpus-level metadata,
  and default settings for subsequent processing of the corpus.
}
\description{
Creates a corpus object from available sources.  The currently available
sources are:
\itemize{
\item a \link{character} vector, consisting of one document per element; if
  the elements are named, these names will be used as document names.
\item a \link{data.frame} (or a \pkg{tibble} \code{tbl_df}), whose default
document id is a variable identified by \code{docid_field}; the text of the
document is a variable identified by \code{text_field}; and other variables
are imported as document-level meta-data.  This matches the format of
data.frames constructed by the \pkg{readtext} package.
\item a \link{kwic} object constructed by \code{\link{kwic}}.
\item a \pkg{tm} \link[tm]{VCorpus} or \link[tm]{SimpleCorpus} class object,
  with the fixed metadata
  fields imported as \link{docvars} and corpus-level metadata imported
  as \link{metacorpus} information.
\item a \link{corpus} object.
}
}
\details{
The texts and document variables of corpus objects can also be
  accessed using index notation. Indexing a corpus object as a vector will
  return its text, equivalent to \code{texts(x)}.  Note that this is not the
  same as subsetting the entire corpus -- this should be done using the
  \code{\link{subset}} method for a corpus.

  Indexing a corpus using two indexes (integers or column names) will return
  the document variables, equivalent to \code{docvars(x)}.  It is also
  possible to access, create, or replace docvars using list notation, e.g.

  \code{myCorpus[["newSerialDocvar"]] <-
  paste0("tag", 1:ndoc(myCorpus))}.

  For details, see \link{corpus-class}.
}
\section{A warning on accessing corpus elements}{
 A corpus currently consists
  of an S3 specially classed list of elements, but \strong{you should not
  access these elements directly}. Use the extractor and replacement
  functions instead, or else your code is not only going to be uglier, but
  also likely to break should the internal structure of a corpus object
  change (as it inevitably will as we continue to develop the package,
  including moving corpus objects to the S4 class system).
}

\examples{
# create a corpus from texts
corpus(data_char_ukimmig2010)

# create a corpus from texts and assign meta-data and document variables
summary(corpus(data_char_ukimmig2010,
               docvars = data.frame(party = names(data_char_ukimmig2010))), 5)

corpus(texts(data_corpus_irishbudget2010))

# import a tm VCorpus
if (requireNamespace("tm", quietly = TRUE)) {
    data(crude, package = "tm")    # load in a tm example VCorpus
    mytmCorpus <- corpus(crude)
    summary(mytmCorpus, showmeta=TRUE)

    data(acq, package = "tm")
    summary(corpus(acq), 5, showmeta=TRUE)

    tmCorp <- tm::VCorpus(tm::VectorSource(data_char_ukimmig2010))
    quantCorp <- corpus(tmCorp)
    summary(quantCorp)
}

# construct a corpus from a data.frame
mydf <- data.frame(letter_factor = factor(rep(letters[1:3], each = 2)),
                  some_ints = 1L:6L,
                  some_text = paste0("This is text number ", 1:6, "."),
                  stringsAsFactors = FALSE,
                  row.names = paste0("fromDf_", 1:6))
mydf
summary(corpus(mydf, text_field = "some_text",
               metacorpus = list(source = "From a data.frame called mydf.")))

# construct a corpus from a kwic object
mykwic <- kwic(data_corpus_inaugural, "southern")
summary(corpus(mykwic))
# from a kwic
kw <- kwic(data_char_sampletext, "econom*")
summary(corpus(kw))
summary(corpus(kw, split_context = FALSE))
texts(corpus(kw, split_context = FALSE))

}
\seealso{
\link{corpus-class}, \code{\link{docvars}}, \code{\link{metadoc}},
  \code{\link{metacorpus}},
  \code{\link{settings}}, \code{\link{texts}}, \code{\link{ndoc}},
  \code{\link{docnames}}
}
\author{
Kenneth Benoit and Paul Nulty
}
\keyword{corpus}
