Overview

opencesp provides tools to synthesize tabular datasets and to evaluate synthetic data from both utility and disclosure risk viewpoints. The emphasis is on the flexibility and accessibility of the methods proposed, hence most of these tools are nonparametric. The synthesis approaches that are targeted by the package are primarily those in which the synthetic distribution is specified in terms of conditional laws. This includes sequential as well as fully conditional specifications.

Installation

The released version of the package can be installed from CRAN with:

install.packages("opencesp")

Main functions

Example

The example below demonstrates how the package can be used to synthesize the iris dataset by sequential specification, and evaluate the risk-utility tradeoff provided by the synthesis.

library(opencesp)
set.seed(1234)

orig <- iris[sapply(iris, is.numeric)]

score_pgb <- function(x, y) {
  if(is.null(x)) {
    d <- density(y, bw = "nrd0")
    return(sum(log(approx(d$x, d$y, xout = y, rule = 2)$y)))
  }
  fit <- pgb_cvh(y ~ ., data = cbind.data.frame(y, x), subsample = 1, select_h = "none")
  cde <- predict_cde_pgb_raw(fit, x, y)
  sum(log(diag(cde)))
}

ord <- dep_order(orig, score_func = score_pgb)
vars <- names(orig)[ord]

tree_grid <- c(25, 150, 300, 600)

# Fit one complete factorization of the joint distribution for each tree budget.
fits_by_tree <- lapply(tree_grid, function(nt) {
  fits <- vector("list", length(vars))
  names(fits) <- vars
  
  fits[[1]] <- density(orig[[vars[1]]], bw = "nrd0")
  
  for(j in seq_along(vars)[-1]) {
    formula_j <- reformulate(vars[seq_len(j - 1)], response = vars[j])
    
    fits[[j]] <- pgb(
      formula_j,
      data = orig[, vars, drop = FALSE],
      pvalid = 0,
      ntrees = nt,
      subsample = 1
    )
  }
  
  fits
})
names(fits_by_tree) <- tree_grid

nsynth <- 10

synth_list <- lapply(fits_by_tree, function(fits) {
  replicate(nsynth, simplify = FALSE, expr = {
    syn <- orig[rep(1, nrow(orig)), vars, drop = FALSE]
    
    # Marginal draw from the reference-bandwidth kernel density estimate.
    syn[[1]] <- sample(orig[[vars[1]]], nrow(orig), replace = TRUE) + rnorm(nrow(orig), sd = fits[[1]]$bw)
    
    for(j in seq_along(vars)[-1]) {
      # Raw conditional draw: choose a quantile interval, then sample uniformly in it.
      qhat <- predict(fits[[j]], syn[, vars[seq_len(j - 1)], drop = FALSE])
      seg <- sample.int(ncol(qhat) - 1, nrow(syn), replace = TRUE, prob = diff(fits[[j]]$targets))
      
      lo <- qhat[cbind(seq_len(nrow(syn)), seg)]
      hi <- qhat[cbind(seq_len(nrow(syn)), seg + 1)]
      
      syn[[j]] <- ifelse(hi > lo, runif(nrow(syn), lo, hi), lo)
    }
    
    round_synth(orig[, vars, drop = FALSE], syn)[, names(orig), drop = FALSE]
  })
})

ru <- data.frame(
  ntrees = tree_grid,
  utility = sapply(synth_list, function(synth_l) {
    mean(sapply(synth_l, function(synth) 1 - 2 * abs(tsAUC(orig, synth)$auc-0.5)))
  }),
  risk = sapply(synth_list, function(synth_l) {
    mean(sapply(synth_l, function(synth) adaptive_matches_prop(orig, synth)))
  })
)

# Plot the risk-utility map for the ts-AUC and the adaptive proportion of matches
plot(ru$utility, ru$risk, pch = 19, xlab = "Utility", ylab = "Risk")

text(ru$utility, ru$risk, labels = ru$ntrees, pos = 2)
lines(ru$utility, ru$risk, lty = 2, col = "grey70")