% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus.R
\name{corpus}
\alias{+.corpus}
\alias{[.corpus}
\alias{corpus}
\alias{corpus.VCorpus}
\alias{corpus.character}
\alias{corpus.corpusSource}
\alias{is.corpus}
\title{constructor for corpus objects}
\usage{
corpus(x, ...)

\method{corpus}{character}(x, enc = NULL, encTo = "UTF-8",
  docnames = NULL, docvars = NULL, source = NULL, notes = NULL,
  citation = NULL, ...)

\method{corpus}{corpusSource}(x, ...)

\method{corpus}{VCorpus}(x, ...)

is.corpus(x)

\method{+}{corpus}(c1, c2)

\method{[}{corpus}(x, i, j = NULL, ..., drop = TRUE)
}
\arguments{
\item{x}{a source of texts to form the documents in the corpus: a character 
vector, a \link{corpusSource-class} object created using 
\code{\link{textfile}}, or a \pkg{tm} \link[tm]{VCorpus} object.}

\item{...}{additional arguments}

\item{enc}{a string specifying the input encoding for texts in the corpus. 
Must be a valid entry in \code{\link[stringi]{stri_enc_list}()}, since 
the code in \code{corpus.character} will convert the texts from this encoding
to \code{encTo} using \code{\link[stringi]{stri_encode}}.  We recommend that
you do \strong{not} use \code{enc}, since if left \code{NULL} (the default)
then \code{corpus()} will detect the input encoding(s) and convert
automatically.

Currently only one input encoding can be specified for a collection of 
input texts, meaning that you should not mix input text encoding types in a
single \code{corpus} call.  However if you suspect multiple encodings, omit
the \code{enc} argument and \code{corpus()} will detect and convert each
file automatically.}

\item{encTo}{target encoding, default is UTF-8.  Unless you have strong reasons
to use an alternative encoding, we strongly recommend you leave this at its 
default.  Must be a valid entry in \code{\link[stringi]{stri_enc_list}()}.}

\item{docnames}{Names to be assigned to the texts, defaults to the names of 
the character vector (if any), otherwise assigns "text1", "text2", etc.}

\item{docvars}{A data frame of attributes that is associated with each text.}

\item{source}{A string specifying the source of the texts, used for 
referencing.}

\item{notes}{A string containing notes about who created the text, warnings, 
To Dos, etc.}

\item{citation}{Information on how to cite the corpus.}

\item{c1}{corpus one to be added}

\item{c2}{corpus two to be added}

\item{i}{index for documents or rows of document variables}

\item{j}{index for column of document variables}

\item{drop}{if \code{TRUE} the result is coerced to the lowest possible
dimension (see the examples). This only works for extracting elements, not
for the replacement. See \code{\link{drop}} for further details.}
}
\value{
A corpus class object containing the original texts, document-level 
  variables, document-level metadata, corpus-level metadata, and default 
  settings for subsequent processing of the corpus.  A corpus consists of a 
  list of elements described below, although these should only be accessed 
  through accessor and replacement functions, not directly (since the 
  internals may be subject to change).  The structure of a corpus classed 
  list object is:
  
  \item{$documents}{A data frame containing the document level information, 
  consisting of \code{\link{texts}}, user-named \code{\link{docvars}} 
  variables describing attributes of the documents, and \code{metadoc} 
  document-level metadata whose names begin with an underscore character, 
  such as \code{_language}.}
  
  \item{$metadata}{A named list set of corpus-level meta-data, including 
  \code{source} and \code{created} (both generated automatically unless 
  assigned), \code{notes}, and \code{citation}.}
  
  \item{$settings}{Settings for the corpus which record options that govern 
  the subsequent processing of the corpus when it is converted into a 
  document-feature matrix (\link{dfm}).  See \link{settings}.}
  
  \item{$tokens}{An indexed list of tokens and types tabulated by document, 
  including information on positions.  Not yet fully implemented.}

\code{is.corpus} returns \code{TRUE} if \code{x} is a corpus object.
}
\description{
Creates a corpus from a document source.  The currently available document 
sources are: \itemize{ \item a character vector (as in R class \code{character}) 
of texts; \item a \link{corpusSource-class} object, constructed using 
\code{\link{textfile}}; \item a \pkg{tm} \link[tm]{VCorpus} class corpus 
object, meaning that anything you can use to create a \pkg{tm} corpus, 
including all of the \pkg{tm} plugins plus the built-in functions of \pkg{tm} for 
importing PDF, Word, and XML documents, can be used to create a \pkg{quanteda} 
\link{corpus}. } Corpus-level meta-data can be specified at creation, 
containing (for example) citation information and notes, as can 
document-level variables and document-level meta-data.
}
\details{
The texts and document variables of corpus objects can also be 
  accessed using index notation. Indexing a corpus object as a vector will 
  return its text, equivalent to \code{texts(x)}.  Indexing a corpus using
  two indexes (integers or column names) will return the document variables,
  equivalent to \code{docvars(x)}.

The \code{+} operator for a corpus object will combine two corpus 
  objects, resolving any non-matching \code{\link{docvars}} or 
  \code{\link{metadoc}} fields by making them into \code{NA} values for the 
  corpus lacking that field.  Corpus-level meta data is concatenated, except 
  for \code{source} and \code{notes}, which are stamped with information 
  pertaining to the creation of the new joined corpus.
  
  There are some issues that need to be addressed in future revisions of 
  quanteda concerning the use of factors to store document variables and 
  meta-data.  Currently most or all of these are not recorded as factors: 
  the \code{\link{data.frame}} calls that create and store the 
  document-level information use \code{stringsAsFactors=FALSE}, since the 
  texts should always be stored as character vectors and never as factors.
}
\note{
When \code{x} is a \link[tm]{VCorpus} object, the fixed metadata 
  fields from that object are imported as document-level metadata. Currently
  no corpus-level metadata is imported, but we will add that soon.
}
\examples{
# create a corpus from texts
corpus(inaugTexts)

# create a corpus from texts, attaching document variables and requesting a
# non-default target encoding through encTo
ukimmigCorpus <- corpus(ukimmigTexts, 
                        docvars = data.frame(party=names(ukimmigTexts)), 
                        encTo = "UTF-16") 

# build a corpus from the texts of ie2010Corpus
corpus(texts(ie2010Corpus))

\dontrun{# the fifth column of this csv file is the text field
mytexts <- textfile("http://www.kenbenoit.net/files/text_example.csv", textField = 5)
mycorp <- corpus(mytexts)
# the same file, this time selecting the text field by column name
mycorp2 <- corpus(textfile("http://www.kenbenoit.net/files/text_example.csv", textField = "Title"))
# compare the texts and document variables obtained by the two routes
identical(texts(mycorp), texts(mycorp2))
identical(docvars(mycorp), docvars(mycorp2))
}
# import a tm VCorpus
# (only runs when the optional tm package is available; requireNamespace()
# checks for that single package instead of listing every installed package
# as installed.packages() does)
if (requireNamespace("tm", quietly = TRUE)) {
    data(crude, package = "tm")    # load in a tm example VCorpus
    mytmCorpus <- corpus(crude)
    summary(mytmCorpus, showmeta=TRUE)
    
    data(acq, package = "tm")      # a second tm example data set
    summary(corpus(acq), 5, showmeta=TRUE)
    
    # construct a VCorpus directly from a subset of the inaugural texts,
    # then convert it with corpus()
    tmCorp <- tm::VCorpus(tm::VectorSource(inaugTexts[49:57]))
    quantCorp <- corpus(tmCorp)
    summary(quantCorp)
}
}
\author{
Kenneth Benoit and Paul Nulty
}
\seealso{
\link{docvars}, \link{metadoc}, \link{metacorpus}, \link{settings}, 
  \link{texts}
}

