% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/genthin.R
\name{thin_diff}
\alias{thin_diff}
\title{Binomial thinning for differential expression analysis.}
\usage{
thin_diff(
  mat,
  design_fixed = NULL,
  coef_fixed = NULL,
  design_perm = NULL,
  coef_perm = NULL,
  target_cor = NULL,
  use_sva = FALSE,
  design_obs = NULL,
  relative = TRUE,
  change_colnames = TRUE,
  permute_method = c("hungarian", "marriage"),
  type = c("thin", "mult")
)
}
\arguments{
\item{mat}{A numeric matrix of RNA-seq counts. The rows index the genes and
the columns index the samples.}

\item{design_fixed}{A numeric design matrix whose rows are fixed and not
to be permuted. The rows index the samples and the columns index the
variables. The intercept should \emph{not} be included
(though see Section "Unestimable Components").}

\item{coef_fixed}{A numeric matrix. The coefficients corresponding to
\code{design_fixed}. The rows index the genes and the columns index
the variables.}

\item{design_perm}{A numeric design matrix whose rows are to be permuted
(thus controlling the amount by which they are correlated with the
surrogate variables). The rows index the samples and the columns index
the variables. The intercept should \emph{not} be included
(though see Section "Unestimable Components").}

\item{coef_perm}{A numeric matrix. The coefficients corresponding to
\code{design_perm}. The rows index the genes and the columns index
the variables.}

\item{target_cor}{A numeric matrix of target correlations between the
variables in \code{design_perm} and the surrogate variables. The
rows index the observed covariates and the columns index the surrogate
variables. That is, \code{target_cor[i, j]} specifies the target
correlation between the \code{i}th column of \code{design_perm} and the
\code{j}th surrogate variable. The surrogate variables are estimated
either using factor analysis or surrogate variable analysis (see the
parameter \code{use_sva}).
The number of columns in \code{target_cor} specifies the number of
surrogate variables. Set \code{target_cor} to \code{NULL} to indicate
that \code{design_perm} and the surrogate variables are independent.}

\item{use_sva}{A logical. Should we use surrogate variable analysis
(Leek and Storey, 2008) using \code{design_obs}
to estimate the hidden covariates (\code{TRUE})
or should we just do an SVD on \code{log2(mat + 0.5)} after
regressing out \code{design_obs} (\code{FALSE})? Setting this to
\code{TRUE} allows the surrogate variables to be correlated with the
observed covariates, while setting this to \code{FALSE} assumes that
the surrogate variables are orthogonal to the observed covariates. This
option only matters if \code{design_obs} is not \code{NULL}.
Defaults to \code{FALSE}.}

\item{design_obs}{A numeric matrix of observed covariates that are NOT to
be a part of the signal generating process. Only used in estimating the
surrogate variables (if \code{target_cor} is not \code{NULL}).
The intercept should \emph{not} be included (it will sometimes
produce an error if it is included).}

\item{relative}{A logical. Should we apply relative thinning (\code{TRUE})
or absolute thinning (\code{FALSE})? Only experts should change
the default.}

\item{change_colnames}{A logical. Should we change the column-names
of the design matrices (\code{TRUE}) or not (\code{FALSE})?
Each new column name begins with either "O" (observed), "P" (permuted),
or "F" (fixed), followed by a number. The letters correspond to
whether the variables come from \code{design_obs}, \code{design_perm},
or \code{design_fixed}. Setting this to \code{TRUE}
also changes the column-names of the corresponding coefficient matrices.
Defaults to \code{TRUE}.}

\item{permute_method}{Should we use the Gale-Shapley algorithm
for stable marriages (\code{"marriage"}) (Gale and Shapley, 1962)
as implemented in the matchingR package, or the Hungarian algorithm
(Papadimitriou and Steiglitz, 1982) (\code{"hungarian"})
as implemented in the clue package (Hornik, 2005)? The
Hungarian method almost always works better, so is the default.}

\item{type}{Should we apply binomial thinning (\code{type = "thin"}) or
just naive multiplication of the counts (\code{type = "mult"})?
You should always have this set to \code{"thin"}.}
}
\value{
A list-like S3 object of class \code{ThinData}.
Components include some or all of the following:
\describe{
  \item{\code{mat}}{The modified matrix of counts.}
  \item{\code{designmat}}{The design matrix of variables used to simulate
      signal. This is made by column-binding \code{design_fixed} and the
      permuted version of \code{design_perm}.}
  \item{\code{coefmat}}{A matrix of coefficients corresponding to
      \code{designmat}.}
  \item{\code{design_obs}}{Additional variables that should be included in
      your design matrix in downstream fittings. This is made by
      column-binding the vector of 1's with \code{design_obs}.}
  \item{\code{sv}}{A matrix of estimated surrogate variables. In simulation
      studies you would probably leave this out and estimate your own
      surrogate variables.}
  \item{\code{cormat}}{A matrix of target correlations between the
      surrogate variables and the permuted variables in the design matrix.
      This might be different from the \code{target_cor} you input because
      we pass it through \code{\link{fix_cor}} to ensure
      positive semi-definiteness of the resulting covariance matrix.}
  \item{\code{matching_var}}{A matrix of simulated variables used to
      permute \code{design_perm} if the \code{target_cor} is not
      \code{NULL}.}
}
}
\description{
Given a matrix of real RNA-seq counts, this function will add a known
amount of signal to the count matrix. This signal is given in the form
of a Poisson / negative binomial / mixture of negative binomials
generalized linear model with a log (base 2) link. The user may
specify any arbitrary design matrix and coefficient matrix. The user
may also control the amount of correlation between the observed
covariates and any unobserved surrogate variables. The method is
described in detail in Gerard (2020).
}
\section{Mathematical Formulation}{

Let
\describe{
  \item{\eqn{N}}{Be the number of samples.}
  \item{\eqn{G}}{Be the number of genes.}
  \item{\eqn{Y}}{Be a \eqn{G} by \eqn{N} matrix of real RNA-seq counts.
      This is \code{mat}.}
  \item{\eqn{X_1}}{Be an \eqn{N} by \eqn{P_1} user-provided design matrix.
      This is \code{design_fixed}.}
  \item{\eqn{X_2}}{Be an \eqn{N} by \eqn{P_2} user-provided design matrix.
      This is \code{design_perm}.}
  \item{\eqn{X_3}}{Be an \eqn{N} by \eqn{P_3} matrix of known covariates.
      This is \code{design_obs}.}
  \item{\eqn{Z}}{Be an \eqn{N} by \eqn{K} matrix of unobserved surrogate
       variables. This is estimated when \code{target_cor} is not
       \code{NULL}.}
  \item{\eqn{M}}{Be a \eqn{G} by \eqn{N} matrix of additional (unknown)
       unwanted variation.}
}
We assume that \eqn{Y} is Poisson distributed given \eqn{X_3} and
\eqn{Z} such that
\deqn{\log_2(EY) = \mu 1_N' + B_3X_3' + AZ' + M.}
\code{thin_diff()} will take as input \eqn{X_1}, \eqn{X_2}, \eqn{B_1},
\eqn{B_2}, and will output a \eqn{\tilde{Y}} and \eqn{W} such that
\eqn{\tilde{Y}} is Poisson distributed given \eqn{X_1}, \eqn{X_2}, \eqn{X_3},
\eqn{W}, \eqn{Z}, and \eqn{M} such that
\deqn{\log_2(E\tilde{Y}) \approx \tilde{\mu}1_N' + B_1X_1' + B_2X_2'W' + B_3X_3' + AZ' + M,}
where \eqn{W} is an \eqn{N} by \eqn{N} permutation matrix. \eqn{W} is randomly
drawn so that \eqn{WX_2} and \eqn{Z} are correlated approximately according
to the target correlation matrix.

The Poisson assumption may be generalized to a mixture of negative binomials.
}

\section{Unestimable Components}{


It is possible to include an intercept term or a column from
\code{design_obs} into either \code{design_fixed} or \code{design_perm}.
This will not produce an error and the specified thinning will be applied.
However, if any column of \code{design_fixed} or
\code{design_perm} is a vector of ones or contains a column from
\code{design_obs}, then the corresponding columns in \code{coef_fixed}
or \code{coef_perm} cannot be estimated by \emph{any} method. This is
represented in the output by having duplicate columns in
\code{designmat} and \code{design_obs}.

Including duplicate columns in \code{design_fixed} and \code{design_perm}
is also allowed but, again, will produce unestimable coefficients.

Including an intercept term in \code{design_obs} will produce an error if
you are specifying correlated surrogate variables.
}

\examples{
## Example: add known signal to a count matrix with thin_diff() while
## targeting a given correlation with one surrogate variable, then check
## how well the surrogate variable and the coefficients are recovered.

## Generate simulated data with surrogate variables
## In practice, you would obtain mat from a real dataset, not simulate it.
set.seed(1)
## n is the number of samples (columns of mat) and p is the number of
## genes (rows of mat).
n <- 10
p <- 1000
## Z is the true surrogate variable (n by 1) and alpha holds its gene-wise
## loadings (p by 1); both are made non-negative with abs().
Z <- matrix(abs(rnorm(n, sd = 4)))
alpha <- matrix(abs(rnorm(p, sd = 1)))
## Counts are 2 raised to (loadings times surrogate variable plus
## non-negative noise), rounded to integers. The result is p by n.
mat <- round(2^(alpha \%*\% t(Z) + abs(matrix(rnorm(n * p, sd = 5),
                                            nrow = p,
                                            ncol = n))))

## Choose simulation parameters
## design_perm has two columns: an alternating 0/1 indicator and a
## uniform random covariate. coef_perm has one column of gene-wise
## coefficients per column of design_perm.
design_perm <- cbind(rep(c(0, 1), length.out = n), runif(n))
coef_perm <- matrix(rnorm(p * ncol(design_perm), sd = 6), nrow = p)

## Specify one surrogate variable (number of columns in target_cor),
## highly correlated with first observed covariate and uncorrelated
## with second observed covariate
target_cor <- matrix(c(0.9, 0))

## Thin
thout <- thin_diff(mat = mat,
                   design_perm = design_perm,
                   coef_perm = coef_perm,
                   target_cor = target_cor)

## target_cor approximates correlation between estimated surrogate variable
## and matching variable.
cor(thout$matching_var, thout$sv)

## Estimated surrogate variable is associated with true surrogate variable
## (because the signal is strong in this case)
plot(Z, thout$sv, xlab = "True SV", ylab = "Estimated SV")

## So target_cor approximates correlation between surrogate variable and
## matching variables
cor(thout$matching_var, Z)

## Correlations between permuted covariates and surrogate variables are less
## close to target_cor
cor(thout$designmat, Z)

## Estimated signal is correlated to true signal. First variable is slightly
## biased because the surrogate variable is not included.
## Rows 2:3 of the lm() coefficients drop the intercept and keep the two
## design variables; t() puts genes in the rows to match thout$coefmat.
Ynew <- log2(t(thout$mat) + 0.5)
X <- thout$designmat
coef_est <- t(coef(lm(Ynew ~ X))[2:3, ])

plot(thout$coefmat[, 1], coef_est[, 1],
     main = "First Variable",
     xlab = "Coefficient",
     ylab = "Estimated Coefficient")
abline(0, 1, col = 2, lwd = 2)

plot(thout$coefmat[, 2], coef_est[, 2],
     main = "Second Variable",
     xlab = "Coefficient",
     ylab = "Estimated Coefficient")
abline(0, 1, col = 2, lwd = 2)

## But estimated coefficient of the first variable is slightly closer when
## the surrogate variable is included.
## Ynew is the same transformed response as above. X now also contains the
## estimated surrogate variable as its last column, so rows 2:3 still pick
## out the two design variables and leave out the surrogate variable.
Ynew <- log2(t(thout$mat) + 0.5)
X <- cbind(thout$designmat, thout$sv)
coef_est <- t(coef(lm(Ynew ~ X))[2:3, ])

plot(thout$coefmat[, 1], coef_est[, 1],
     main = "First Variable",
     xlab = "Coefficient",
     ylab = "Estimated Coefficient")
abline(0, 1, col = 2, lwd = 2)

plot(thout$coefmat[, 2], coef_est[, 2],
     main = "Second Variable",
     xlab = "Coefficient",
     ylab = "Estimated Coefficient")
abline(0, 1, col = 2, lwd = 2)

}
\references{
\itemize{
  \item{Gale, David, and Lloyd S. Shapley. "College admissions and the stability of marriage." \emph{The American Mathematical Monthly} 69, no. 1 (1962): 9-15. \doi{10.1080/00029890.1962.11989827}.}
  \item{Gerard, D (2020). "Data-based RNA-seq simulations by binomial thinning." \emph{BMC Bioinformatics}. 21(1), 206. \doi{10.1186/s12859-020-3450-9}.}
  \item{Hornik K (2005). "A CLUE for CLUster Ensembles." \emph{Journal of Statistical Software}, 14(12). \doi{10.18637/jss.v014.i12}.}
  \item{Leek, Jeffrey T., and John D. Storey. "A general framework for multiple testing dependence." \emph{Proceedings of the National Academy of Sciences} 105, no. 48 (2008): 18718-18723. \doi{10.1073/pnas.0808709105}.}
  \item{C. Papadimitriou and K. Steiglitz (1982), Combinatorial Optimization: Algorithms and Complexity. Englewood Cliffs: Prentice Hall.}
}
}
\seealso{
\describe{
  \item{\code{\link{select_counts}}}{For subsampling the rows and columns
      of your real RNA-seq count matrix prior to applying binomial thinning.}
  \item{\code{\link{thin_2group}}}{For the specific application of
      \code{thin_diff} to the two-group model.}
  \item{\code{\link{thin_lib}}}{For the specific application of
      \code{thin_diff} to library size thinning.}
  \item{\code{\link{thin_gene}}}{For the specific application of
      \code{thin_diff} to total gene expression thinning.}
  \item{\code{\link{thin_all}}}{For the specific application of
      \code{thin_diff} to thinning all counts uniformly.}
  \item{\code{\link{thin_base}}}{For the underlying thinning function
      used in \code{thin_diff}.}
  \item{\code{\link[sva]{sva}}}{For the implementation of surrogate
      variable analysis.}
  \item{\code{\link{ThinDataToSummarizedExperiment}}}{For converting a
      ThinData object to a SummarizedExperiment object.}
  \item{\code{\link{ThinDataToDESeqDataSet}}}{For converting a
      ThinData object to a DESeqDataSet object.}
}
}
\author{
David Gerard
}
