% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/civis_ml.R
\name{civis_ml}
\alias{civis_ml}
\alias{civis_ml_fetch_existing}
\alias{predict.civis_ml}
\title{Interface for modeling in the Civis Platform}
\usage{
civis_ml(
  x,
  dependent_variable,
  model_type,
  primary_key = NULL,
  excluded_columns = NULL,
  parameters = NULL,
  fit_params = NULL,
  cross_validation_parameters = NULL,
  calibration = NULL,
  oos_scores_table = NULL,
  oos_scores_db = NULL,
  oos_scores_if_exists = c("fail", "append", "drop", "truncate"),
  model_name = NULL,
  cpu_requested = NULL,
  memory_requested = NULL,
  disk_requested = NULL,
  notifications = NULL,
  polling_interval = NULL,
  validation_data = c("train", "skip"),
  n_jobs = NULL,
  verbose = FALSE,
  civisml_version = "prod"
)

civis_ml_fetch_existing(model_id, run_id = NULL)

\method{predict}{civis_ml}(
  object,
  newdata,
  primary_key = NA,
  output_table = NULL,
  output_db = NULL,
  if_output_exists = c("fail", "append", "drop", "truncate"),
  n_jobs = NULL,
  cpu_requested = NULL,
  memory_requested = NULL,
  disk_requested = NULL,
  polling_interval = NULL,
  verbose = FALSE,
  dvs_to_predict = NULL,
  ...
)
}
\arguments{
\item{x, newdata}{See the Data Sources section below.}

\item{dependent_variable}{The dependent variable of the training dataset.
For a multi-target problem, this should be a vector of column names of
dependent variables. Nulls in a single dependent variable will
automatically be dropped.}

\item{model_type}{The name of the CivisML workflow. See the Workflows section
below.}

\item{primary_key}{Optional, the unique ID (primary key) of the training
dataset. This will be used to index the out-of-sample scores. In
\code{predict.civis_ml}, the primary_key of the training task is used by
default (\code{primary_key = NA}). Use \code{primary_key = NULL} to
explicitly indicate the data have no primary_key.}

\item{excluded_columns}{Optional, a vector of columns which will be
considered ineligible to be independent variables.}

\item{parameters}{Optional, parameters for the final stage estimator in a
predefined model, e.g. \code{list(C = 2)} for a "sparse_logistic"
model.}

\item{fit_params}{Optional, a mapping from parameter names in the model's
\code{fit} method to the column names which hold the data, e.g.
\code{list(sample_weight = 'survey_weight_column')}.}

\item{cross_validation_parameters}{Optional, parameter grid for learner
parameters, e.g. \code{list(n_estimators = c(100, 200, 500),
learning_rate = c(0.01, 0.1), max_depth = c(2, 3))}
or \code{"hyperband"} for supported models.}

\item{calibration}{Optional, if not \code{NULL}, calibrate output
probabilities with the selected method, either \code{sigmoid} or \code{isotonic}.
Valid only with classification models.}

\item{oos_scores_table}{Optional, if provided, store out-of-sample
predictions on training set data to this Redshift "schema.tablename".}

\item{oos_scores_db}{Optional, the name of the database where the
\code{oos_scores_table} will be created. If not provided, this will default
to \code{database_name}.}

\item{oos_scores_if_exists}{Optional, action to take if
\code{oos_scores_table} already exists. One of \code{"fail"}, \code{"append"}, \code{"drop"}, or \code{"truncate"}.
The default is \code{"fail"}.}

\item{model_name}{Optional, the prefix of the Platform modeling jobs.
It will have \code{" Train"} or \code{" Predict"} added to become the Script title.}

\item{cpu_requested}{Optional, the number of CPU shares requested in the
Civis Platform for training jobs or prediction child jobs.
1024 shares = 1 CPU.}

\item{memory_requested}{Optional, the memory requested from Civis Platform
for training jobs or prediction child jobs, in MiB.}

\item{disk_requested}{Optional, the disk space requested on Civis Platform
for training jobs or prediction child jobs, in GB.}

\item{notifications}{Optional, model status notifications. See
\code{\link{scripts_post_custom}} for further documentation about email
and URL notification.}

\item{polling_interval}{The number of seconds to wait between checks for job completion.}

\item{validation_data}{Optional, source for validation data. There are
currently two options: \code{train} (the default), which uses training
data for validation, and \code{skip}, which skips the validation step.}

\item{n_jobs}{Number of concurrent Platform jobs to use for training and
validation, or multi-file / large table prediction. Defaults to
\code{NULL}, which allows CivisML to dynamically calculate an
appropriate number of workers to use (in general, as many as
possible without using all resources in the cluster).}

\item{verbose}{Optional, if \code{TRUE}, supply debug outputs in Platform
logs and make prediction child jobs visible.}

\item{civisml_version}{Optional, a one-length character vector of the
CivisML version. The default is "prod", the latest version in production.}

\item{model_id}{The \code{id} of a CivisML model built previously.}

\item{run_id}{Optional, the \code{id} of a CivisML model run. If \code{NULL},
defaults to fetching the latest run.}

\item{object}{A \code{civis_ml} object.}

\item{output_table}{The table in which to put predictions.}

\item{output_db}{The database containing \code{output_table}. If not
provided, this will default to the \code{database_name} specified when
the model was built.}

\item{if_output_exists}{Action to take if the prediction table already exists. One of \code{"fail"}, \code{"append"}, \code{"drop"}, or \code{"truncate"}.
The default is \code{"fail"}.}

\item{dvs_to_predict}{Optional, for scoring, this should be a vector of column
names of dependent variables to include in the output table. It must be a
subset of the \code{dependent_variable} vector provided for training.
The scores for the returned subset will be identical to the scores which
those outputs would have had if all outputs were written, but ignoring some
of the model's outputs will let predictions complete faster and use less disk space.
If not provided, the entire model output will be written to the output table.}

\item{\dots}{Unused}
}
\value{
A \code{civis_ml} object, a list containing the following elements:
\item{job}{job metadata from \code{\link{scripts_get_custom}}.}
\item{run}{run metadata from \code{\link{scripts_get_custom_runs}}.}
\item{outputs}{CivisML metadata from \code{\link{scripts_list_custom_runs_outputs}} containing the locations of
 files produced by CivisML e.g. files, projects, metrics, model_info, logs, predictions, and estimators.}
\item{metrics}{Parsed CivisML output from \code{metrics.json} containing metadata from validation.
 A list containing the following elements:
  \itemize{
  \item run list, metadata about the run.
  \item data list, metadata about the training data.
  \item model list, the fitted scikit-learn model with CV results.
  \item metrics list, validation metrics (accuracy, confusion, ROC, AUC, etc).
  \item warnings list.
  \item data_platform list, training data location.
}}
\item{model_info}{Parsed CivisML output from \code{model_info.json} containing metadata from training.
 A list containing the following elements:
  \itemize{
  \item run list, metadata about the run.
  \item data list, metadata about the training data.
  \item model list, the fitted scikit-learn model.
  \item metrics empty list.
  \item warnings list.
  \item data_platform list, training data location.
  }}
}
\description{
An interface for training and scoring data on Civis Platform
using a set of Scikit-Learn estimators.
}
\section{CivisML Workflows}{


You can use the following pre-defined models with \code{civis_ml}. All models
start by imputing missing values with the mean of non-null values in a
column. The \code{"sparse_*"} models include a LASSO regression step
(using \code{glmnet}) to do feature selection before passing data to the
final model. In some models, CivisML uses default parameters different from those in
\href{http://scikit-learn.org/stable/}{Scikit-Learn}, as indicated in the "Altered Defaults" column.
All models also have \code{random_state=42}.

Specific workflows can also be called directly using the R workflow functions.

\tabular{rrrrr}{
 Name \tab R Workflow \tab Model Type \tab Algorithm \tab Altered Defaults \cr
 \code{sparse_logistic}	\tab \code{\link{civis_ml_sparse_logistic}} \tab classification	\tab \href{http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html}{LogisticRegression}	\tab \code{C=499999950, tol=1e-08} \cr
 \code{gradient_boosting_classifier} \tab	\code{\link{civis_ml_gradient_boosting_classifier}} \tab classification \tab	\href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html}{GradientBoostingClassifier} \tab	\code{n_estimators=500, max_depth=2} \cr
 \code{random_forest_classifier} \tab	\code{\link{civis_ml_random_forest_classifier}} \tab classification \tab	\href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html}{RandomForestClassifier} \tab	\code{n_estimators=500} \cr
 \code{extra_trees_classifier} \tab	\code{\link{civis_ml_extra_trees_classifier}} \tab classification \tab	\href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html}{ExtraTreesClassifier} \tab	\code{n_estimators=500} \cr
 \code{multilayer_perceptron_classifier} \tab \tab classification \tab \href{https://github.com/civisanalytics/muffnn}{muffnn.MLPClassifier} \tab \cr
 \code{stacking_classifier} \tab \tab classification  \tab \href{https://github.com/civisanalytics/civisml-extensions}{StackedClassifier}\tab \cr
 \code{sparse_linear_regressor} \tab \code{\link{civis_ml_sparse_linear_regressor}} \tab	regression \tab	\href{http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html}{LinearRegression} \tab \cr
 \code{sparse_ridge_regressor} \tab	\code{\link{civis_ml_sparse_ridge_regressor}} \tab regression \tab	\href{http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html}{Ridge} \tab \cr
 \code{gradient_boosting_regressor}	\tab \code{\link{civis_ml_gradient_boosting_regressor}} \tab regression \tab \href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html}{GradientBoostingRegressor} \tab \code{n_estimators=500, max_depth=2} \cr
 \code{random_forest_regressor}	\tab \code{\link{civis_ml_random_forest_regressor}} \tab regression \tab \href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html}{RandomForestRegressor} \tab \code{n_estimators=500} \cr
 \code{extra_trees_regressor} \tab \code{\link{civis_ml_extra_trees_regressor}} \tab regression	\tab \href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html}{ExtraTreesRegressor} \tab \code{n_estimators=500} \cr
 \code{multilayer_perceptron_regressor} \tab \tab regression \tab \href{https://github.com/civisanalytics/muffnn}{muffnn.MLPRegressor} \tab \cr
 \code{stacking_regressor} \tab \tab regression  \tab \href{https://github.com/civisanalytics/civisml-extensions}{StackedRegressor}\tab \cr
}
Model names can be easily accessed using the global variables \code{CIVIS_ML_REGRESSORS} and \code{CIVIS_ML_CLASSIFIERS}.
}

\section{Stacking}{


The \code{"stacking_classifier"} model stacks the \code{"gradient_boosting_classifier"} and
\code{"random_forest_classifier"} predefined models together with a
\code{glmnet.LogitNet(alpha=0, n_splits=4, max_iter=10000, tol=1e-5, scoring='log_loss')}.
Defaults for the predefined models are documented in \code{?civis_ml}. Each column is first
\href{http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html}{standardized},
and then the model predictions are combined using
\href{http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html}{LogisticRegressionCV}
with \code{penalty='l2'} and \code{tol=1e-08}. The \code{"stacking_regressor"} works similarly, stacking together
the \code{"gradient_boosting_regressor"} and \code{"random_forest_regressor"} models and a
\code{glmnet.ElasticNet(alpha=0, n_splits=4, max_iter=10000, tol=1e-5, scoring='r2')}, combining them using
\href{https://github.com/civisanalytics/civisml-extensions}{NonNegativeLinearRegression}.
 The estimators that are being stacked have the same names as the
associated pre-defined models, and the meta-estimator steps are named
"meta-estimator". Note that although default parameters are provided
for multilayer perceptron models, it is highly recommended that
multilayer perceptrons be run using hyperband.
}

\section{Hyperparameter Tuning}{

You can tune hyperparameters using one of two methods: grid search or
hyperband. CivisML will perform grid search if you pass a list
of hyperparameters to the \code{cross_validation_parameters} parameter, where list elements are
hyperparameter names, and the values are vectors of hyperparameter
values to grid search over. You can run hyperparameter optimization in parallel by
setting the \code{n_jobs}
parameter to however many jobs you would like to run in
parallel. By default, \code{n_jobs} is dynamically calculated based on
the resources available on your cluster, such that a modeling job will
never take up more than 90\% of the cluster resources at once.

 \href{https://arxiv.org/abs/1603.06560}{Hyperband}
is an efficient approach to hyperparameter optimization, and
recommended over grid search where possible. CivisML will perform
hyperband optimization if you pass the string \code{"hyperband"} to
\code{cross_validation_parameters}. Hyperband is currently only supported for the following models:
\code{"gradient_boosting_classifier"}, \code{"random_forest_classifier"},
\code{"extra_trees_classifier"}, \code{"multilayer_perceptron_classifier"},
\code{"stacking_classifier"},
\code{"gradient_boosting_regressor"}, \code{"random_forest_regressor"},
\code{"extra_trees_regressor"}, \code{"multilayer_perceptron_regressor"},
and \code{"stacking_regressor"}.

Hyperband cannot be used to tune GLMs. For this reason, preset GLMs do
not have a hyperband option. Similarly, when
\code{cross_validation_parameters='hyperband'} and the model is
\code{stacking_classifier} or \code{stacking_regressor}, only the GBT and
random forest steps of the stacker are tuned using hyperband. For the specific
distributions used in the predefined hyperband models, see
\href{https://civis-python.readthedocs.io/en/stable/ml.html#hyperparameter-tuning}{the detailed table in the Python client documentation}.
}

\section{Data Sources}{


For building models with \code{civis_ml}, the training data can reside in
four different places: a file in the Civis Platform, a CSV or feather-format file
on the local disk, a \code{data.frame} resident in the local R environment, or
a table in the Civis Platform. Use the following helpers to specify the
data source when calling \code{civis_ml}:

\describe{
  \item{\code{data.frame}}{\code{civis_ml(x = df, ...)}}
  \item{local csv file}{\code{civis_ml(x = "path/to/data.csv", ...)}}
  \item{file in Civis Platform}{\code{civis_ml(x = civis_file(1234))}}
  \item{table in Civis Platform}{\code{civis_ml(x = civis_table(table_name = "schema.table", database_name = "database"))}}
}
}

\section{Out of sample scores}{

Model outputs will always contain out-of-sample (or out of fold) scores,
which are accessible through \code{\link{fetch_oos_scores}}.
These may be stored in a Civis table on Redshift using the
\code{oos_scores_table}, \code{oos_scores_db}, and \code{oos_scores_if_exists} parameters.
}

\section{Predictions}{


A fitted model can be used to make predictions for data residing in any of
the sources above, or in a \code{\link{civis_file_manifest}}. Similar to
\code{civis_ml}, use the data source helpers as the \code{newdata} argument
to \code{predict.civis_ml}.

A manifest file is a JSON file which specifies the location of many shards of the data to be used for prediction.
A manifest file is the output of a Civis export job with \code{force_multifile = TRUE} set, e.g.
from \code{\link{civis_to_multifile_csv}}. Large civis tables (provided using \code{table_name})
will automatically be exported to manifest files.

Prediction outputs will always be stored as gzipped CSVs in one or more civis files.
Provide an \code{output_table} (and optionally an \code{output_db},
if it's different from \code{database_name}) to copy these predictions into a
table on Redshift.
}

\examples{
\dontrun{
# From a data frame:
m <- civis_ml(df, model_type = "sparse_logistic",
              dependent_variable = "Species")

# From a table:
m <- civis_ml(civis_table("schema.table", "database_name"),
              model_type = "sparse_logistic", dependent_variable = "Species",
              oos_scores_table = "schema.scores_table",
              oos_scores_if_exists = "drop")

# From a local file:
m <- civis_ml("path/to/file.csv", model_type = "sparse_logistic",
              dependent_variable = "Species")

# From a Civis file:
file_id <- write_civis_file("path/to/file.csv", name = "file.csv")
m <- civis_ml(civis_file(file_id), model_type = "sparse_logistic",
              dependent_variable = "Species")

pred_job <- predict(m, newdata = df)
pred_job <- predict(m, civis_table("schema.table", "database_name"),
                    output_table = "schema.scores_table")
pred_job <- predict(m, civis_file(file_id),
                    output_table = "schema.scores_table")

m <- civis_ml_fetch_existing(model_id = m$job$id, m$run$id)
logs <- fetch_logs(m)
yhat <- fetch_oos_scores(m)
yhat <- fetch_predictions(pred_job)
}
}
\seealso{
\code{\link{civis_file}}, \code{\link{civis_table}}, and
  \code{\link{civis_file_manifest}} for specifying data sources.

  \code{\link{get_metric}} to access model validation metrics.

  \code{\link{fetch_logs}} for retrieving logs for a (failed) model build,
  \code{\link{fetch_oos_scores}} for retrieving the out of sample (fold) scores for each training observation, and
  \code{\link{fetch_predictions}} for retrieving the predictions from a prediction job.
}
