#' Run VIProDesign Workflow
#'
#' This function performs the VIProDesign workflow for clustering and analyzing protein sequences.
#' It includes steps for filtering sequences, removing redundancy, identifying and removing outliers,
#' and clustering sequences using PAM (Partitioning Around Medoids).
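#' When `use_cd_hit = TRUE`, redundancy removal requires the `cd-hit` executable to be reachable via
#' `cd_hit_path`; if it cannot be run, the function stops with an error. Redundancy removal is only
#' attempted when the environment variable `NOT_CRAN` is set to `"true"`; otherwise, or when
#' `use_cd_hit = FALSE`, the step is skipped and the workflow proceeds with the filtered sequences.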
#'
#' To install `cd-hit`, you can use conda:
#' ```
#' conda install -c bioconda cd-hit
#' ```
#' Or download it from the official website: http://weizhong-lab.ucsd.edu/cd-hit/
#'
#' @importFrom utils write.table
#' @param file A string specifying the path to the input FASTA file containing protein sequences.
#' @param output_prefix A string specifying the prefix for output files generated by the workflow.
#' @param max_cluster_number An integer specifying the maximum number of clusters to evaluate (optional).
#' @param predefined_cluster_number An integer specifying a predefined number of clusters for PAM clustering (optional).
#' @param use_cd_hit A logical value indicating whether to remove redundant sequences using `cd-hit` (default: TRUE).
#' @param cd_hit_path A string specifying the path to the `cd-hit` executable (default: "cd-hit").
#' @param cutoff A numeric value specifying the redundancy cutoff for `cd-hit` (default: 0.99).
#' @param remove_outliers A logical value indicating whether to identify and remove outliers using DBSCAN clustering (default: TRUE).
#' @param verbose A logical value indicating whether to print detailed messages during execution (default: FALSE).
#' @return A list containing the following elements:
#' \itemize{
#'   \item \code{filtered_file}: Path to the FASTA file containing the filtered sequences (only if redundancy removal was performed).
#'   \item \code{non_redundant_file}: Path to the FASTA file containing the non-redundant sequences (only if redundancy removal was performed).
#'   \item \code{no_outlier_obj}: An `AAStringSet` object containing the sequences with outliers removed (only if outlier removal was performed).
#'   \item \code{clustering_info}: Clustering information generated by PAM clustering (only if a cluster number was supplied).
#'   \item \code{final_panel}: The final representative sequences selected by the workflow (only if a cluster number was supplied).
#' }
#' @details The workflow includes the following steps:
#' - Filtering sequences.
#' - Removing redundancy using `cd-hit`.
#' - Identifying and removing outliers using DBSCAN clustering.
#' - Performing PAM clustering to identify representative sequences.
#' - Calculating entropy to evaluate clustering quality.
#'
#' @examples
#' # Example usage:
#' temp_dir <- tempdir()
#' temp_prefix <- file.path(temp_dir, "output")
#' input_file <- system.file("extdata", "input.fasta", package = "VIProDesign")
#' run_VIProDesign(
#'   file = input_file,
#'   output_prefix = temp_prefix,
#'   max_cluster_number = 5,
#'   use_cd_hit = TRUE,
#'   cd_hit_path = "cd-hit",
#'   cutoff = 0.99,
#'   remove_outliers = TRUE
#' )
#' # Clean up
#' unlink(list.files(temp_dir, full.names = TRUE))
#' @export
# Main workflow function

run_VIProDesign <- function(file, output_prefix, max_cluster_number = NULL, predefined_cluster_number = NULL,
                            use_cd_hit = TRUE, cd_hit_path = "cd-hit", cutoff = 0.99, remove_outliers = TRUE,
                            verbose = FALSE) {
  # Accumulates every artefact produced along the way; returned at the end.
  results <- list()

  # Several branches below depend on the NOT_CRAN environment variable; read it once.
  # NOTE(review): this gating means cd-hit is never run (and the user's output directory is
  # never used) unless NOT_CRAN == "true". It is preserved here because changing it would
  # alter behaviour for existing callers -- confirm whether that is really intended.
  not_cran <- identical(Sys.getenv("NOT_CRAN"), "true")

  # ---- Input validation -------------------------------------------------------------------
  if (!file.exists(file)) {
    if (!not_cran) {
      stop("The input file does not exist. Please provide a valid file path.", call. = FALSE)
    }
    # With NOT_CRAN == "true" the bundled example file is substituted for a missing input.
    # Warn so that a mistyped path does not silently produce results for the wrong data.
    fallback <- system.file("extdata", "input.fasta", package = "VIProDesign")
    if (fallback == "") {
      stop("The test input file 'input.fasta' could not be found during devtools::check.", call. = FALSE)
    }
    warning("Input file '", file, "' does not exist; using the bundled example file instead.",
            call. = FALSE)
    file <- fallback
  }
  file <- normalizePath(file, mustWork = TRUE)

  # Output goes next to the user-supplied prefix when NOT_CRAN == "true", else into tempdir().
  output_dir <- if (not_cran) dirname(output_prefix) else tempdir()
  output_prefix <- file.path(output_dir, basename(output_prefix))

  # ---- Step 1: filter sequences -----------------------------------------------------------
  if (verbose) message("Input file path: ", file)

  filtered_obj <- tryCatch(
    filter_sequences(file),
    error = function(e) {
      stop("Error in filter_sequences(): ", conditionMessage(e), call. = FALSE)
    }
  )
  if (length(filtered_obj) == 0) {
    stop("No sequences were returned by filter_sequences(). Please check the input file.", call. = FALSE)
  }

  # ---- Step 2a: remove redundancy with cd-hit ---------------------------------------------
  if (use_cd_hit && not_cran) {
    # `cutoff` is pasted into a shell command, so insist on a single plain number.
    if (!is.numeric(cutoff) || length(cutoff) != 1 || is.na(cutoff)) {
      stop("`cutoff` must be a single numeric value.", call. = FALSE)
    }

    # output_prefix is a *prefix*: append the suffix instead of treating it as a directory.
    filtered_file <- normalizePath(paste0(output_prefix, "_filtered.fasta"), mustWork = FALSE)
    target_dir <- dirname(filtered_file)
    if (!dir.exists(target_dir)) {
      dir.create(target_dir, recursive = TRUE, showWarnings = FALSE)
    }
    Biostrings::writeXStringSet(filtered_obj, filepath = filtered_file, format = "fasta")
    results$filtered_file <- filtered_file

    non_redundant_file <- normalizePath(paste0(filtered_file, "_non_red.fasta"), mustWork = FALSE)

    # Probe the executable first so a missing cd-hit yields a clear error message.
    probe <- tryCatch(
      system(paste(cd_hit_path, "--help"), intern = TRUE, ignore.stderr = TRUE),
      error = function(e) NULL
    )
    if (is.null(probe) || length(probe) == 0 || !any(grepl("CD-HIT", probe))) {
      stop("cd-hit is not installed or not accessible at the specified path: ", cd_hit_path, call. = FALSE)
    }

    # Run cd-hit on the FILTERED sequences (not the raw input) and quote the paths so that
    # directories containing spaces or shell metacharacters are passed through intact.
    command <- paste(cd_hit_path,
                     "-i", shQuote(filtered_file),
                     "-o", shQuote(non_redundant_file),
                     "-c", cutoff,
                     "-n 5 -M 16000 -d 0 -g 1")
    if (verbose) message(command)

    status <- system(command)
    if (!identical(as.integer(status), 0L) || !file.exists(non_redundant_file)) {
      stop("cd-hit failed (exit status ", status, ") and produced no output file: ",
           non_redundant_file, call. = FALSE)
    }
    results$non_redundant_file <- non_redundant_file
    non_redundant_obj <- Biostrings::readAAStringSet(non_redundant_file)
  } else {
    if (verbose) message("cd-hit is not available. Using the filtered file as input.")
    non_redundant_obj <- filtered_obj
  }

  sequences <- non_redundant_obj
  if (length(non_redundant_obj) == 0) {
    stop("No sequences were returned by cd-hit. Please check the input file.", call. = FALSE)
  }

  # ---- Step 2b: remove outliers -----------------------------------------------------------
  if (remove_outliers) {
    # Sort the per-sequence values from phyl_tree_distance_k() and take the value at the
    # elbow of the sorted curve as the threshold handed to phyl_tree_cluster_dbscan().
    numbers_sort <- sort(phyl_tree_distance_k(non_redundant_obj))
    index <- seq_along(numbers_sort)
    n_elbow <- tryCatch(
      pathviewr::find_curve_elbow(cbind(index, numbers_sort), export_type = "row_num", plot_curve = FALSE),
      error = function(e) {
        stop("Error in finding the elbow point: ", conditionMessage(e), call. = FALSE)
      }
    )
    # Kept in its own variable so the cd-hit `cutoff` argument is not clobbered.
    distance_cutoff <- numbers_sort[n_elbow]

    no_outlier_obj <- phyl_tree_cluster_dbscan(non_redundant_obj, distance_cutoff, 5)
    results$no_outlier_obj <- no_outlier_obj
    sequences <- no_outlier_obj
  }

  # ---- Step 3: clustering -----------------------------------------------------------------
  if (length(sequences) == 0) {
    stop("No sequences were found in the input file. Please check the file format and content.", call. = FALSE)
  }

  # Every sequence must consist solely of the 20 standard residues and gap characters.
  sequence_strings <- as.character(sequences)
  if (!all(grepl("^[ACDEFGHIKLMNPQRSTVWY-]+$", sequence_strings))) {
    stop("The input file contains invalid protein sequences. Please ensure all sequences are valid amino acid sequences (including gaps).", call. = FALSE)
  }

  # A gap anywhere is taken to mean the input is already aligned; otherwise align it now.
  already_aligned <- any(grepl("-", sequence_strings, fixed = TRUE))
  alignment <- if (already_aligned) sequences else DECIPHER::AlignSeqs(sequences)

  n <- length(alignment)
  if (n < 2) {
    stop("The alignment must contain at least 2 sequences to calculate a distance matrix.", call. = FALSE)
  }
  if (length(unique(Biostrings::width(alignment))) > 1) {
    stop("Not all sequences in the alignment are the same length.", call. = FALSE)
  }

  dist_matrix <- DECIPHER::DistanceMatrix(alignment)

  # -- Fixed number of clusters --
  n_cl <- predefined_cluster_number
  if (!is.null(n_cl)) {
    if (n_cl >= n) {
      stop(paste("The number of clusters (n_cl =", n_cl, ") must be less than the number of sequences in the alignment (length =", n, ")."), call. = FALSE)
    }
    pamx <- cluster::pam(dist_matrix, n_cl, diss = TRUE)
    results$clustering_info <- pamx$clusinfo
    # NOTE(review): medoids are taken from `sequences` here but from `alignment` in the
    # elbow branch below; kept as-is -- confirm which form the final panel should have.
    results$final_panel <- sequences[pamx$medoids]
  }

  # -- Number of clusters chosen by the entropy elbow (overrides the results above) --
  max_n <- max_cluster_number
  if (!is.null(max_n)) {
    if (max_n >= n) {
      stop(paste("Max number of clusters (max_n =", max_n, ") must be less than the number of sequences in the alignment (length =", n, ")."), call. = FALSE)
    }
    # Guard against 2:max_n silently counting downwards when max_n < 2.
    if (max_n < 2) {
      stop("max_cluster_number must be at least 2.", call. = FALSE)
    }

    # Summed, rounded entropy_RR() of the medoid panel for each candidate k.
    k_values <- 2:max_n
    entropy <- vapply(k_values, function(k) {
      medoid_panel <- alignment[cluster::pam(dist_matrix, k, diss = TRUE)$medoids]
      sum(round(entropy_RR(medoid_panel), 2))
    }, numeric(1))
    result_matrix <- cbind(index = k_values, entropy = entropy)

    row_num <- tryCatch(
      pathviewr::find_curve_elbow(result_matrix, export_type = "row_num", plot_curve = FALSE),
      error = function(e) {
        stop("Error finding the elbow point: ", conditionMessage(e), call. = FALSE)
      }
    )
    # is.finite() is FALSE for NA, NaN and +/-Inf alike.
    if (!is.numeric(row_num) || length(row_num) != 1 || !is.finite(row_num)) {
      stop("Elbow point is not a valid real number.", call. = FALSE)
    }

    m_elbow <- result_matrix[row_num, 1]
    if (verbose) message(paste("Elbow entropy=", result_matrix[row_num, 2], "with clusters=", m_elbow))

    pamx_elbow <- cluster::pam(dist_matrix, m_elbow, diss = TRUE)
    results$clustering_info <- pamx_elbow$clusinfo
    results$final_panel <- alignment[pamx_elbow$medoids]
  }

  results
}
