% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/kgram_freqs.R
\name{kgram_freqs}
\alias{kgram_freqs}
\alias{kgram_freqs.numeric}
\alias{kgram_freqs.kgram_freqs}
\alias{kgram_freqs.character}
\alias{kgram_freqs.connection}
\alias{process_sentences}
\alias{process_sentences.character}
\alias{process_sentences.connection}
\title{k-gram Frequency Tables}
\usage{
kgram_freqs(object, ...)

\method{kgram_freqs}{numeric}(
  object,
  .preprocess = identity,
  .tknz_sent = identity,
  dict = NULL,
  ...
)

\method{kgram_freqs}{kgram_freqs}(object, ...)

\method{kgram_freqs}{character}(
  object,
  N,
  .preprocess = identity,
  .tknz_sent = identity,
  dict = NULL,
  open_dict = is.null(dict),
  verbose = FALSE,
  ...
)

\method{kgram_freqs}{connection}(
  object,
  N,
  .preprocess = identity,
  .tknz_sent = identity,
  dict = NULL,
  open_dict = is.null(dict),
  verbose = FALSE,
  max_lines = Inf,
  batch_size = max_lines,
  ...
)

process_sentences(
  text,
  freqs,
  .preprocess = attr(freqs, ".preprocess"),
  .tknz_sent = attr(freqs, ".tknz_sent"),
  open_dict = TRUE,
  in_place = TRUE,
  verbose = FALSE,
  ...
)

\method{process_sentences}{character}(
  text,
  freqs,
  .preprocess = attr(freqs, ".preprocess"),
  .tknz_sent = attr(freqs, ".tknz_sent"),
  open_dict = TRUE,
  in_place = TRUE,
  verbose = FALSE,
  ...
)

\method{process_sentences}{connection}(
  text,
  freqs,
  .preprocess = attr(freqs, ".preprocess"),
  .tknz_sent = attr(freqs, ".tknz_sent"),
  open_dict = TRUE,
  in_place = TRUE,
  verbose = FALSE,
  max_lines = Inf,
  batch_size = max_lines,
  ...
)
}
\arguments{
\item{object}{any type allowed by the available methods. The type defines the
behaviour of \code{kgram_freqs()} as a default constructor, a copy
constructor or a constructor of a non-trivial object. See ‘Details’.}

\item{...}{further arguments passed to or from other methods.}

\item{.preprocess}{a function taking a character vector as input and returning
a character vector as output. Optional preprocessing transformation
applied to text before k-gram tokenization. See ‘Details’.}

\item{.tknz_sent}{a function taking a character vector as input and
returning a character vector as output. Optional sentence tokenization step
applied to text after preprocessing and before k-gram tokenization. See
‘Details’.}

\item{dict}{anything coercible to class
\link[kgrams]{dictionary}. Optional pre-specified word dictionary.}

\item{N}{a length one integer. Maximum order of k-grams to be considered.}

\item{open_dict}{\code{TRUE} or \code{FALSE}. If \code{TRUE}, any new
word encountered during processing not appearing in the original dictionary
is included into the dictionary. Otherwise, new words are replaced by an
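unknown word token. In \code{kgram_freqs()}, it is by default \code{TRUE}
if \code{dict} is not specified (i.e. \code{NULL}), \code{FALSE} otherwise;
in \code{process_sentences()}, the default is \code{TRUE}.}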

\item{verbose}{\code{TRUE} or \code{FALSE}. Should current progress be
printed to the console?}

\item{max_lines}{a length one positive integer or \code{Inf}.
Maximum number of lines to be read from the \code{connection}.
If \code{Inf}, keeps reading until the End-Of-File.}

\item{batch_size}{a length one positive integer less than or equal to
\code{max_lines}. Size of text batches when reading text from
\code{connection}.}

\item{text}{a character vector or a connection. Source of text from which
k-gram frequencies are to be extracted.}

\item{freqs}{a \code{kgram_freqs} object, to which new k-gram counts from
\code{text} are to be added.}

\item{in_place}{\code{TRUE} or \code{FALSE}. Should the initial
\code{kgram_freqs} object be modified in place?}
}
\value{
A \code{kgram_freqs} class object: k-gram frequency table storing
k-gram counts from text. For \code{process_sentences()}, the updated
\code{kgram_freqs} object is returned invisibly if \code{in_place} is
\code{TRUE}, visibly otherwise.
}
\description{
Extract k-gram frequency counts from a text or a connection.
\subsection{Principal methods supported by objects of class \code{kgram_freqs}}{
\itemize{
\item \code{query()}: query k-gram counts from the table.
See \link[kgrams]{query}.
\item \code{probability()}: compute word continuation and sentence probabilities
using Maximum Likelihood estimates. See \link[kgrams]{probability}.
\item \code{language_model()}: build a k-gram language model using various
probability smoothing techniques. See \link[kgrams]{language_model}.
}
}
}
\details{
The function \code{kgram_freqs()} is a generic constructor for
objects of class \code{kgram_freqs}, i.e. k-gram frequency tables. The
constructor from \code{integer} returns an empty \code{kgram_freqs} object
of fixed order, with an optional predefined dictionary (which can be empty)
and \code{.preprocess} and
\code{.tknz_sent} functions to be used as defaults in other \code{kgram_freqs}
methods. The constructor from \code{kgram_freqs} returns a copy of an
existing object, and it is provided because, in general, \code{kgram_freqs}
objects have reference semantics, as discussed below.

The following discussion focuses on the \code{process_sentences()} generic, as
well as on the \code{character} and \code{connection} methods of the
constructor \code{kgram_freqs()}. These functions extract k-gram
frequency counts from a text source, which may be either a character vector
or a connection. The second option is useful if one wants to avoid loading
the full text corpus in physical memory, and allows one to process text from
different sources such as files, compressed files or URLs.

The returned object is of class \code{kgram_freqs} (a thin wrapper
around the internal C++ class where all k-gram computations take place).
\code{kgram_freqs} objects have methods for querying bare k-gram frequencies
(\link[kgrams]{query}) and maximum likelihood estimates of sentence
probabilities or word continuation probabilities
(see \link[kgrams]{probability}). More importantly,
\code{kgram_freqs} objects are used to create \link[kgrams]{language_model}
objects, which support various probability smoothing techniques.

The function \code{kgram_freqs()} is used to \emph{construct} a new
\code{kgram_freqs} object, initializing it with the k-gram counts from
the \code{text} input, whereas \code{process_sentences()} is used to
add k-gram counts from a new \code{text} to an \emph{existing}
\code{kgram_freqs} object, \code{freqs}. In this second case, the initial
object \code{freqs} can either be modified in place
(for \code{in_place == TRUE}, the default) or be left unchanged, with the
new counts added to a copy (\code{in_place == FALSE}); see the examples below.
The final object is returned invisibly when modifying in place,
visibly in the second case. It is worth mentioning that modifying in place
a \code{kgram_freqs} object \code{freqs} will also affect
\code{language_model} objects created from \code{freqs} with
\code{language_model()}, which will also be updated with the new information.
If one wants to avoid this behaviour, one can make copies using either the
\code{kgram_freqs()} copy constructor, or the \code{in_place = FALSE}
argument.

The \code{dict} argument allows one to provide an initial set of known
words. Subsequently, one can either work with such a closed dictionary
(\code{open_dict == FALSE}), or extend the dictionary with all
new words encountered during k-gram processing
(\code{open_dict == TRUE}).

The \code{.preprocess} and \code{.tknz_sent} functions are applied
\emph{before} k-gram counting takes place, and are in principle
arbitrary transformations of the original text.
\emph{After} preprocessing and sentence tokenization, each line of the
transformed input is presented to the k-gram counting algorithm as a separate
sentence. These sentences are implicitly padded with \code{N - 1}
Begin-Of-Sentence (BOS) tokens at the start and one End-Of-Sentence (EOS)
token at the end, as illustrated in the examples. For basic
usage, this package offers the utilities \link[kgrams]{preprocess} and
\link[kgrams]{tknz_sent}. Notice that, strictly speaking, there is
some redundancy in these two arguments, as the processed input to the k-gram
counting algorithm is \code{.tknz_sent(.preprocess(text))}.
They appear explicitly as separate arguments for two main reasons:
\itemize{
\item The presence of \code{.tknz_sent} is a reminder of the
fact that sentences have to be explicitly separated into different entries
of the processed input, in order for \code{kgram_freqs()} to append the
correct Begin-Of-Sentence and End-Of-Sentence paddings to each sentence.
\item At prediction time (e.g. with \link[kgrams]{probability}), by default only
\code{.preprocess} is applied when computing conditional probabilities,
whereas both \code{.preprocess()} and \code{.tknz_sent()} are
applied when computing sentence absolute probabilities.
}
}
\examples{
# Build a k-gram frequency table from a character vector

f <- kgram_freqs("a b b a a", 3)
f
summary(f)
query(f, c("a", "b")) # c(3, 2)
query(f, c("a b", "a" \%+\% EOS(), BOS() \%+\% "a b")) # c(1, 1, 1)
query(f, "a b b a") # NA (counts for k-grams of order k > 3 are not known)

process_sentences("b", f)
query(f, c("a", "b")) # c(3, 3): 'f' is updated in place

f1 <- process_sentences("b", f, in_place = FALSE)
query(f, c("a", "b")) # c(3, 3): 'f' is copied
query(f1, c("a", "b")) # c(3, 4): the new 'f1' stores the updated counts




# Build a k-gram frequency table from a file connection

\dontrun{
f <- kgram_freqs(file("myfile.txt"), 3)
}


# Build a k-gram frequency table from a URL connection
\dontrun{
### Shakespeare's "Much Ado About Nothing" (entire play)
con <- url("http://shakespeare.mit.edu/much_ado/full.html")

# Apply some basic preprocessing
.preprocess <- function(x) {
        # Remove character names and locations (boldfaced in original html)
        x <- gsub("<b>[A-z]+</b>", "", x)
        # Remove other html tags
        x <- gsub("<[^>]+>||<[^>]+$||^[^>]+>$", "", x)
        # Apply standard preprocessing including lower-case
        x <- kgrams::preprocess(x)
        return(x)
}

.tknz_sent <- function(x) {
        # Tokenize sentences keeping Shakespeare's punctuation
        x <- kgrams::tknz_sent(x, keep_first = TRUE)
        # Remove empty sentences
        x <- x[x != ""]
        return(x)
}

f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000)
summary(f)

query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0)
}
}
\seealso{
\link[kgrams]{query}, \link[kgrams]{probability},
\link[kgrams]{language_model}, \link[kgrams]{dictionary}
}
\author{
Valerio Gherardi
}
