% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nfunctions.R
\name{ntoken}
\alias{ntoken}
\alias{ntype}
\title{Count the number of tokens or types}
\usage{
ntoken(x, ...)

ntype(x, ...)
}
\arguments{
\item{x}{a \pkg{quanteda} object: a character, \link{corpus},
\link{tokens}, or \link{dfm} object}

\item{...}{additional arguments passed to \code{\link{tokens}}}
}
\value{
count of the total tokens or types
}
\description{
Get the count of tokens (total features) or types (unique tokens)
in a text, corpus, or dfm.
}
\details{
The precise definition of "tokens" for objects not yet tokenized (e.g.
\link{character} or \link{corpus} objects) can be controlled through optional
arguments passed to \code{\link{tokens}} through \code{...}.

For \link{dfm} objects, \code{ntype} will only return the count of features
that occur more than zero times in the dfm.
}
\note{
Due to differences between raw text tokens and features that have been 
  defined for a \link{dfm}, the counts may be different for dfm objects and the
  texts from which the dfm was generated.  Because the method tokenizes the 
  text in order to count the tokens, your results will depend on the options 
  passed through to \code{\link{tokens}}.
}
\examples{
# a small two-document example
txt <- c(text1 = "This is a sentence, this.", text2 = "A word. Repeated repeated.")
ntoken(txt)
ntype(txt)

# lowercasing: the token count stays the same, but there are fewer types
txt_lower <- toLower(txt)
ntoken(txt_lower)
ntype(txt_lower)
ntoken(txt_lower, removePunct = TRUE)
ntype(txt_lower, removePunct = TRUE)

# with some real texts: counts on a corpus subset
inaug_pre1806 <- corpus_subset(data_corpus_inaugural, Year < 1806)
ntoken(inaug_pre1806, removePunct = TRUE)
ntype(inaug_pre1806, removePunct = TRUE)

# counts on a dfm built from a corpus subset
inaug_pre1800 <- corpus_subset(data_corpus_inaugural, Year < 1800)
ntoken(dfm(inaug_pre1800))
ntype(dfm(inaug_pre1800))
}

