% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_cleaning.R
\name{cleaning_data}
\alias{cleaning_data}
\title{Data Cleaning}
\usage{
cleaning_data(dat, target = NULL, x_list = NULL, obs_id = NULL,
  occur_time = NULL, pos_flag = NULL, miss_values = NULL,
  ex_cols = NULL, outlier_proc = TRUE, missing_proc = TRUE,
  default_miss = TRUE, low_var = TRUE, parallel = FALSE,
  note = FALSE, save_data = FALSE, dir_path = tempdir(),
  file_name = NULL)
}
\arguments{
\item{dat}{A data frame with x and target.}

\item{target}{The name of target variable.}

\item{x_list}{A list of x variables.}

\item{obs_id}{The name of ID of observations. Default is NULL.}

\item{occur_time}{The name of occur time of observations. Default is NULL.}

\item{pos_flag}{The value of positive class of target variable, default: "1".}

\item{miss_values}{Other extreme values that might be used to represent missing values, e.g. -9999, -9998. These miss_values will be encoded as -1 or "Unknown".}

\item{ex_cols}{A list of excluded variables. Default is NULL.}

\item{outlier_proc}{Logical, process outliers or not. Default is TRUE.}

\item{missing_proc}{Logical, process NAs or not. Default is TRUE.}

\item{default_miss}{Logical. If TRUE, assign the missing values to -1 or "Unknown"; otherwise, process the missing values according to the results of missing analysis.}

\item{low_var}{Logical, delete low variance variables or not. Default is TRUE.}

\item{parallel}{Logical, parallel computing or not. Default is FALSE.}

\item{note}{Logical. Outputs info. Default is FALSE.}

\item{save_data}{Logical, save the result or not. Default is FALSE.}

\item{dir_path}{The path for periodically saved data file. Default is \code{tempdir()}.}

\item{file_name}{The name for periodically saved data file. Default is NULL.}
}
\value{
A preprocessed data.frame
}
\description{
The \code{cleaning_data} function is a simple wrapper for data cleaning functions, such as: deleting variables whose values are all NAs; checking the format of dat and target; deleting low variance variables; replacing null or NULL or blank with NA; encoding variables whose NAs & miss value rate is more than 95\% as 1,0; encoding variables whose unique value rate is more than 95\% as 1,0; merging categories of character variables whose number of categories is more than 8; converting time variables to date format; removing duplicated observations; processing outliers; processing NAs.
}
\examples{
#data cleaning
dat_cl <- cleaning_data(dat = UCICreditCard[1:2000,],
                       target = "default.payment.next.month",
                       x_list = NULL,
                       obs_id = "ID",
                       occur_time = "apply_date",
                       ex_cols = c("PAY_6|BILL_"),
                       outlier_proc = TRUE,
                       missing_proc = TRUE,
                       default_miss = FALSE,
                       low_var = TRUE,
                       save_data = FALSE)

}
\seealso{
\code{\link{remove_duplicated}},
\code{\link{null_blank_na}},
\code{\link{entry_rate_max}},
\code{\link{entry_rate_na}},
\code{\link{low_variance_filter}},
\code{\link{process_nas}},
\code{\link{process_outliers}}
}
