% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/udpipe_parse.R
\name{udpipe_annotate}
\alias{udpipe_annotate}
\title{Tokenise, Tag and Dependency Parsing Annotation of raw text}
\usage{
udpipe_annotate(object, x, doc_id = paste("doc", seq_along(x), sep = ""),
  tokenizer = "tokenizer", tagger = c("default", "none"),
  parser = c("default", "none"), ...)
}
\arguments{
\item{object}{an object of class \code{udpipe_model} as returned by \code{\link{udpipe_load_model}}}

\item{x}{a character vector in UTF-8 encoding where each element of the character vector
contains text which you would like to tokenise, tag and dependency parse.}

\item{doc_id}{an identifier of a document with the same length as \code{x}. This should be a character vector.
\code{doc_id[i]} corresponds to \code{x[i]}.}

\item{tokenizer}{a character string of length 1, which is either 'tokenizer' (default udpipe tokenisation)
or a character string with more complex tokenisation options
as specified in \url{http://ufal.mff.cuni.cz/udpipe/users-manual}, in which case \code{tokenizer} should be a single character string
in which the options are concatenated, separated by semicolons.}

\item{tagger}{a character string of length 1, which is either 'default' (default udpipe POS tagging and lemmatisation)
or 'none' (no POS tagging and lemmatisation needed) or a character string with more complex tagging options
as specified in \url{http://ufal.mff.cuni.cz/udpipe/users-manual}, in which case \code{tagger} should be a single character string
in which the options are concatenated, separated by semicolons.}

\item{parser}{a character string of length 1, which is either 'default' (default udpipe dependency parsing) or
'none' (no dependency parsing needed) or a character string with more complex parsing options
as specified in \url{http://ufal.mff.cuni.cz/udpipe/users-manual}, in which case \code{parser} should be a single character string
in which the options are concatenated, separated by semicolons.}

\item{...}{currently not used}
}
\value{
a list with 3 elements
\describe{
 \item{x}{The \code{x} character vector with text.}
 \item{conllu}{A character vector of length 1 containing the annotated result of the annotation flow in CoNLL-U format.
 This format is explained at \url{http://universaldependencies.org/format.html}.}
 \item{error}{A vector with the same length as \code{x} containing possible errors when annotating \code{x}.}
}
}
\description{
Tokenise, Tag and Dependency Parsing Annotation of raw text
}
\examples{
## Download the Dutch model and load it.
## The download result is kept in its own variable (dl) because x is reused below
## for the annotation results and the model file path is needed again for the cleanup.
dl <- udpipe_download_model(language = "dutch-lassysmall")
ud_dutch <- udpipe_load_model(dl$file_model)

## Tokenise, Tag and Dependency Parsing Annotation. Output is in CoNLL-U format.
txt <- c("Dus. Godvermehoeren met pus in alle puisten, 
  zei die schele van Van Bukburg en hij had nog gelijk ook. 
  Er was toen dat liedje van tietenkonttieten kont tieten kontkontkont, 
  maar dat hoefden we geenseens niet te zingen. 
  Je kunt zeggen wat je wil van al die gesluierde poezenpas maar d'r kwam wel 
  een vleeswarenwinkel onder te voorschijn van heb je me daar nou.
  
  En zo gaat het maar door.",
  "Wat die ransaap van een academici nou weer in z'n botte pan heb gehaald mag 
  Joost in m'n schoen gooien, maar feit staat boven water dat het een gore 
  vieze vuile ransaap is.")
x <- udpipe_annotate(ud_dutch, x = txt)
cat(x$conllu)
as.data.frame(x)

## Only tokenisation
x <- udpipe_annotate(ud_dutch, x = txt, tagger = "none", parser = "none")
as.data.frame(x)

## Only tokenisation and POS tagging + lemmatisation, no dependency parsing
x <- udpipe_annotate(ud_dutch, x = txt, tagger = "default", parser = "none")
as.data.frame(x)

## Only tokenisation and dependency parsing, no POS tagging nor lemmatisation
x <- udpipe_annotate(ud_dutch, x = txt, tagger = "none", parser = "default")
as.data.frame(x)

## Provide doc_id for joining and identification purpose
x <- udpipe_annotate(ud_dutch, x = txt, doc_id = c("id1", "feedbackabc"),
                     tagger = "none", parser = "none")
as.data.frame(x)

## Remark on encodings: if your data is not in UTF-8 encoding,
## make sure you convert it to UTF-8.
## This can be done using iconv as follows for example
udpipe_annotate(ud_dutch, x = iconv('Ik drink melk bij mijn koffie.', to = "UTF-8"))

## cleanup for CRAN only - you probably want to keep your model if you have downloaded it
## Uses the path reported by the download instead of a hard-coded, version-specific file name.
if (file.exists(dl$file_model)) file.remove(dl$file_model)
}
\references{
\url{https://ufal.mff.cuni.cz/udpipe}, \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2364}, 
\url{http://universaldependencies.org/format.html}
}
\seealso{
\code{\link{udpipe_load_model}}, \code{\link{as.data.frame.udpipe_connlu}}
}
