% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read_transcript.R
\name{read_transcript}
\alias{read_transcript}
\title{Read Transcripts Into R}
\usage{
read_transcript(
  file,
  col.names = c("Person", "Dialogue"),
  text.var = NULL,
  merge.broke.tot = TRUE,
  header = FALSE,
  dash = "",
  ellipsis = "...",
  quote2bracket = FALSE,
  rm.empty.rows = TRUE,
  na = "",
  sep = NULL,
  skip = 0,
  text,
  comment.char = "",
  max.person.nchar = 20,
  ...
)
}
\arguments{
\item{file}{The name of the file which the data are to be read from. Each row
of the table appears as one line of the file. If it does not contain an
absolute path, the file name is relative to the current working directory,
\code{\link[base:getwd]{base::getwd()}}.}

\item{col.names}{A character vector specifying the column names of the
transcript columns.}

\item{text.var}{A character string specifying the name of the text variable;
supplying it ensures that the variable is classed as character.  If \code{NULL},
\code{\link[=read_transcript]{read_transcript()}} attempts to guess the text variable
(dialogue).}

\item{merge.broke.tot}{logical.  If \code{TRUE} and the file being read in
is a .docx file in which a single turn of talk is broken apart by spacing,
\code{read_transcript} will attempt to merge the pieces into a single turn of talk.}

\item{header}{logical.  If \code{TRUE} the file contains the names of the
variables as its first line.}

\item{dash}{A character string to replace the en and em dash special
characters (default is to remove).}

\item{ellipsis}{A character string to replace the ellipsis special characters.}

\item{quote2bracket}{logical. If \code{TRUE} replaces curly quotes with curly
braces (default is \code{FALSE}).  If \code{FALSE} curly quotes are removed.}

\item{rm.empty.rows}{logical.  If \code{TRUE},
\code{\link[=read_transcript]{read_transcript()}} attempts to remove empty rows.}

\item{na}{A character string to be interpreted as an \code{NA} value.}

\item{sep}{The field separator character. Values on each line of the file are
separated by this character.  The default of \code{NULL} instructs
\code{\link[=read_transcript]{read_transcript()}} to use a separator suitable for the file
type being read in.}

\item{skip}{Integer; the number of lines of the data file to skip before
beginning to read data.}

\item{text}{Character string: if file is not supplied and this is, then data
are read from the value of text. Notice that a literal string can be used to
include (small) data sets within R code.}

\item{comment.char}{A character vector of length one containing a single
character or an empty string. Use \code{""} to turn off the interpretation of
comments altogether.}

\item{max.person.nchar}{The maximum number of characters that person names are
expected to contain.  This information is used to warn the user if a separator
appears beyond this length in the text.}

\item{...}{Further arguments to be passed to \code{\link[utils:read.table]{utils::read.table()}},
\code{\link[readxl:read_excel]{readxl::read_excel()}}, or \code{\link[=read_doc]{read_doc()}}.}
}
\value{
Returns a data frame of dialogue and people.
}
\description{
Read .docx, .doc, .rtf, .csv, .xlsx, .xls, or .txt transcript style files into R.
}
\note{
If a transcript is a .docx file, \code{read_transcript} expects two columns
(generally person and dialogue) with some sort of separator (the default is a
colon separator).  .doc files must be converted to .docx before reading in.
}
\section{Warning}{
 The output of \code{\link[=read_transcript]{read_transcript()}} may contain
errors if the file being read in is .docx.  The researcher should carefully
investigate each transcript for errors before further parsing the data.
}

\examples{
(doc1 <- system.file("docs/trans1.docx", package = "textreadr"))
(doc2 <- system.file("docs/trans2.docx", package = "textreadr"))
(doc3 <- system.file("docs/trans3.docx", package = "textreadr"))
(doc4 <- system.file("docs/trans4.xlsx", package = "textreadr"))
(doc5 <- system.file("docs/trans5.xls", package = "textreadr"))
(doc6 <- system.file("docs/trans6.doc", package = "textreadr"))
##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr"))
(doc8 <- system.file("docs/trans8.odt", package = "textreadr"))

dat1 <- read_transcript(doc1)
dat2 <- read_transcript(doc1, col.names = c("person", "dialogue"))

## read_transcript(doc2) #throws an error (need skip)
dat3 <- read_transcript(doc2, skip = 1)

## read_transcript(doc3, skip = 1) #incorrect read; wrong sep
dat4 <- read_transcript(doc3, sep = "-", skip = 1)

## xlsx/xls format
dat5 <- read_transcript(doc4)
dat6 <- read_transcript(doc5)

## MS doc format
\dontrun{
dat6b <- read_transcript(doc6) ## need to skip Researcher
dat6c <- read_transcript(doc6, skip = 1)
}

## rtf format
\dontrun{
rtf_doc <- download(
    'https://raw.githubusercontent.com/trinker/textreadr/master/inst/docs/trans7.rtf'
)
dat9 <- read_transcript(rtf_doc, skip = 1)
}

## odt format
read_transcript(doc8)

## text string input
trans <- "sam: Computer is fun. Not too fun.
greg: No it's not, it's dumb.
teacher: What should we do?
sam: You liar, it stinks!"

read_transcript(text=trans)

## Read in text specify spaces as sep
## EXAMPLE 1
read_transcript(text="34    The New York Times reports a lot of words here.
12    Greenwire reports a lot of words.
31    Only three words.
 2    The Financial Times reports a lot of words.
 9    Greenwire short.
13    The New York Times reports a lot of words again.",
    col.names = c("NO", "ARTICLE"), sep = "   ")

## EXAMPLE 2
read_transcript(text="34..    The New York Times reports a lot of words here.
12..    Greenwire reports a lot of words.
31..    Only three words.
 2..    The Financial Times reports a lot of words.
 9..    Greenwire short.
13..    The New York Times reports a lot of words again.",
    col.names = c("NO", "ARTICLE"), sep = "\\\\.\\\\.")

## Real Example
real_dat <- read_transcript(
    system.file("docs/Yasmine_Interview_Transcript.docx", package = "textreadr"),
    skip = 19
)
}
\references{
\url{https://github.com/trinker/qdap/wiki/Reading-.docx-\%5BMS-Word\%5D-Transcripts-into-R}
}
\author{
Bryan Goodrich and Tyler Rinker \href{mailto:tyler.rinker@gmail.com}{tyler.rinker@gmail.com}.
}
\keyword{transcript}
